From a9e7717a2bc303b001b5731da76a99682d4995e8 Mon Sep 17 00:00:00 2001
From: Jan Dupej <109523496+jandupej@users.noreply.github.com>
Date: Wed, 26 Apr 2023 16:03:37 +0200
Subject: [PATCH] [mono][jit] Adding Vector128.ConvertXX as intrinsic on arm64.
 (#85163)

* [mono][jit] Adding Vector128.ConvertXX as intrinsic on arm64.

* Changed rounding model on f->i conversion.

* Disabled f32->i32 casting test.

* Disabled all of failing JIT tests.
---
 src/mono/mono/arch/arm64/arm64-codegen.h | 33 ++++++------------------
 src/mono/mono/mini/simd-arm64.h          |  4 +++
 src/mono/mono/mini/simd-intrinsics.c     | 44 ++++++++++++++++++++++++--------
 src/tests/issues.targets                 |  6 +++++
 4 files changed, 52 insertions(+), 35 deletions(-)

diff --git a/src/mono/mono/arch/arm64/arm64-codegen.h b/src/mono/mono/arch/arm64/arm64-codegen.h
index 8593d3f..8729f34 100644
--- a/src/mono/mono/arch/arm64/arm64-codegen.h
+++ b/src/mono/mono/arch/arm64/arm64-codegen.h
@@ -1222,11 +1222,18 @@ arm_encode_arith_imm (int imm, guint32 *shift)
 #define arm_neon_fcvtl(p, rd, rn) arm_neon_2mvec_opcode ((p), VREG_LOW, 0b0, SIZE_2, 0b10111, (rd), (rn))
 #define arm_neon_fcvtl2(p, rd, rn) arm_neon_2mvec_opcode ((p), VREG_FULL, 0b0, SIZE_2, 0b10111, (rd), (rn))
 
-
 // Parametrized variants of the bitwise opcodes
 //   width - determines if full register or its lower half is used, one of {VREG_LOW, VREG_FULL}
 #define arm_neon_not(p, width, rd, rn) arm_neon_2mvec_opcode ((p), (width), 0b1, 0b00, 0b00101, (rd), (rn))
 
+#define arm_neon_ucvtf(p, width, type, rd, rn) arm_neon_2mvec_opcode ((p), (width), 0b1, (type), 0b11101, (rd), (rn))
+#define arm_neon_scvtf(p, width, type, rd, rn) arm_neon_2mvec_opcode ((p), (width), 0b0, (type), 0b11101, (rd), (rn))
+#define arm_neon_fcvtns(p, width, type, rd, rn) arm_neon_2mvec_opcode ((p), (width), 0b0, (type) - 2, 0b11010, (rd), (rn)) // -2 converts src type to dest type
+#define arm_neon_fcvtnu(p, width, type, rd, rn) arm_neon_2mvec_opcode ((p), (width), 0b1, (type) - 2, 0b11010, (rd), (rn))
+#define arm_neon_fcvtzs(p, width, type, rd, rn) arm_neon_2mvec_opcode ((p), (width), 0b0, 0b10 | (type), 0b11011, (rd), (rn))
+#define arm_neon_fcvtzu(p, width, type, rd, rn) arm_neon_2mvec_opcode ((p), (width), 0b1, 0b10 | (type), 0b11011, (rd), (rn))
+
+
 // Specific opcodes:
 #define arm_neon_rev64_8b(p, rd, rn) arm_neon_2mvec_opcode ((p), VREG_LOW, 0b0, SIZE_1, 0b00000, (rd), (rn))
 #define arm_neon_rev64_16b(p, rd, rn) arm_neon_2mvec_opcode ((p), VREG_FULL, 0b0, SIZE_1, 0b00000, (rd), (rn))
@@ -1326,10 +1333,6 @@ arm_encode_arith_imm (int imm, guint32 *shift)
 #define arm_neon_frintm_4s(p, rd, rn) arm_neon_2mvec_opcode ((p), VREG_FULL, 0b0, SIZE_1, 0b11001, (rd), (rn))
 #define arm_neon_frintm_2d(p, rd, rn) arm_neon_2mvec_opcode ((p), VREG_FULL, 0b0, SIZE_2, 0b11001, (rd), (rn))
 
-#define arm_neon_fcvtns_2s(p, rd, rn) arm_neon_2mvec_opcode ((p), VREG_LOW, 0b0, SIZE_1, 0b11010, (rd), (rn))
-#define arm_neon_fcvtns_4s(p, rd, rn) arm_neon_2mvec_opcode ((p), VREG_FULL, 0b0, SIZE_1, 0b11010, (rd), (rn))
-#define arm_neon_fcvtns_2d(p, rd, rn) arm_neon_2mvec_opcode ((p), VREG_FULL, 0b0, SIZE_2, 0b11010, (rd), (rn))
-
 #define arm_neon_fcvtms_2s(p, rd, rn) arm_neon_2mvec_opcode ((p), VREG_LOW, 0b0, SIZE_1, 0b11011, (rd), (rn))
 #define arm_neon_fcvtms_4s(p, rd, rn) arm_neon_2mvec_opcode ((p), VREG_FULL, 0b0, SIZE_1, 0b11011, (rd), (rn))
 #define arm_neon_fcvtms_2d(p, rd, rn) arm_neon_2mvec_opcode ((p), VREG_FULL, 0b0, SIZE_2, 0b11011, (rd), (rn))
@@ -1338,10 +1341,6 @@ arm_encode_arith_imm (int imm, guint32 *shift)
 #define arm_neon_fcvtas_4s(p, rd, rn) arm_neon_2mvec_opcode ((p), VREG_FULL, 0b0, SIZE_1, 0b11100, (rd), (rn))
 #define arm_neon_fcvtas_2d(p, rd, rn) arm_neon_2mvec_opcode ((p), VREG_FULL, 0b0, SIZE_2, 0b11100, (rd), (rn))
 
-#define arm_neon_scvtf_2s(p, rd, rn) arm_neon_2mvec_opcode ((p), VREG_LOW, 0b0, SIZE_1, 0b11101, (rd), (rn))
-#define arm_neon_scvtf_4s(p, rd, rn) arm_neon_2mvec_opcode ((p), VREG_FULL, 0b0, SIZE_1, 0b11101, (rd), (rn))
-#define arm_neon_scvtf_2d(p, rd, rn) arm_neon_2mvec_opcode ((p), VREG_FULL, 0b0, SIZE_2, 0b11101, (rd), (rn))
-
 #define arm_neon_frint32z_2s(p, rd, rn) arm_neon_2mvec_opcode ((p), VREG_LOW, 0b0, SIZE_1, 0b11110, (rd), (rn))
 #define arm_neon_frint32z_4s(p, rd, rn) arm_neon_2mvec_opcode ((p), VREG_FULL, 0b0, SIZE_1, 0b11110, (rd), (rn))
 #define arm_neon_frint32z_2d(p, rd, rn) arm_neon_2mvec_opcode ((p), VREG_FULL, 0b0, SIZE_2, 0b11110, (rd), (rn))
@@ -1378,10 +1377,6 @@ arm_encode_arith_imm (int imm, guint32 *shift)
 #define arm_neon_fcvtps_4s(p, rd, rn) arm_neon_2mvec_opcode ((p), VREG_FULL, 0b0, 0b10 | SIZE_1, 0b11010, (rd), (rn))
 #define arm_neon_fcvtps_2d(p, rd, rn) arm_neon_2mvec_opcode ((p), VREG_FULL, 0b0, 0b10 | SIZE_2, 0b11010, (rd), (rn))
 
-#define arm_neon_fcvtzs_2s(p, rd, rn) arm_neon_2mvec_opcode ((p), VREG_LOW, 0b0, 0b10 | SIZE_1, 0b11011, (rd), (rn))
-#define arm_neon_fcvtzs_4s(p, rd, rn) arm_neon_2mvec_opcode ((p), VREG_FULL, 0b0, 0b10 | SIZE_1, 0b11011, (rd), (rn))
-#define arm_neon_fcvtzs_2d(p, rd, rn) arm_neon_2mvec_opcode ((p), VREG_FULL, 0b0, 0b10 | SIZE_2, 0b11011, (rd), (rn))
-
 #define arm_neon_urecpe_2s(p, rd, rn) arm_neon_2mvec_opcode ((p), VREG_LOW, 0b0, 0b10 | SIZE_1, 0b11100, (rd), (rn))
 #define arm_neon_urecpe_4s(p, rd, rn) arm_neon_2mvec_opcode ((p), VREG_FULL, 0b0, 0b10 | SIZE_1, 0b11100, (rd), (rn))
 
@@ -1491,10 +1486,6 @@ arm_encode_arith_imm (int imm, guint32 *shift)
 #define arm_neon_frintx_4s(p, rd, rn) arm_neon_2mvec_opcode ((p), VREG_FULL, 0b1, SIZE_1, 0b11001, (rd), (rn))
 #define arm_neon_frintx_2d(p, rd, rn) arm_neon_2mvec_opcode ((p), VREG_FULL, 0b1, SIZE_2, 0b11001, (rd), (rn))
 
-#define arm_neon_fcvtnu_2s(p, rd, rn) arm_neon_2mvec_opcode ((p), VREG_LOW, 0b1, SIZE_1, 0b11010, (rd), (rn))
-#define arm_neon_fcvtnu_4s(p, rd, rn) arm_neon_2mvec_opcode ((p), VREG_FULL, 0b1, SIZE_1, 0b11010, (rd), (rn))
-#define arm_neon_fcvtnu_2d(p, rd, rn) arm_neon_2mvec_opcode ((p), VREG_FULL, 0b1, SIZE_2, 0b11010, (rd), (rn))
-
 #define arm_neon_fcvtmu_2s(p, rd, rn) arm_neon_2mvec_opcode ((p), VREG_LOW, 0b1, SIZE_1, 0b11011, (rd), (rn))
 #define arm_neon_fcvtmu_4s(p, rd, rn) arm_neon_2mvec_opcode ((p), VREG_FULL, 0b1, SIZE_1, 0b11011, (rd), (rn))
 #define arm_neon_fcvtmu_2d(p, rd, rn) arm_neon_2mvec_opcode ((p), VREG_FULL, 0b1, SIZE_2, 0b11011, (rd), (rn))
@@ -1503,10 +1494,6 @@ arm_encode_arith_imm (int imm, guint32 *shift)
 #define arm_neon_fcvtau_4s(p, rd, rn) arm_neon_2mvec_opcode ((p), VREG_FULL, 0b1, SIZE_1, 0b11100, (rd), (rn))
 #define arm_neon_fcvtau_2d(p, rd, rn) arm_neon_2mvec_opcode ((p), VREG_FULL, 0b1, SIZE_2, 0b11100, (rd), (rn))
 
-#define arm_neon_ucvtf_2s(p, rd, rn) arm_neon_2mvec_opcode ((p), VREG_LOW, 0b1, SIZE_1, 0b11101, (rd), (rn))
-#define arm_neon_ucvtf_4s(p, rd, rn) arm_neon_2mvec_opcode ((p), VREG_FULL, 0b1, SIZE_1, 0b11101, (rd), (rn))
-#define arm_neon_ucvtf_2d(p, rd, rn) arm_neon_2mvec_opcode ((p), VREG_FULL, 0b1, SIZE_2, 0b11101, (rd), (rn))
-
 #define arm_neon_frint32x_2s(p, rd, rn) arm_neon_2mvec_opcode ((p), VREG_LOW, 0b1, SIZE_1, 0b11110, (rd), (rn))
 #define arm_neon_frint32x_4s(p, rd, rn) arm_neon_2mvec_opcode ((p), VREG_FULL, 0b1, SIZE_1, 0b11110, (rd), (rn))
 #define arm_neon_frint32x_2d(p, rd, rn) arm_neon_2mvec_opcode ((p), VREG_FULL, 0b1, SIZE_2, 0b11110, (rd), (rn))
@@ -1541,10 +1528,6 @@ arm_encode_arith_imm (int imm, guint32 *shift)
 #define arm_neon_fcvtpu_4s(p, rd, rn) arm_neon_2mvec_opcode ((p), VREG_FULL, 0b1, 0b10 | SIZE_1, 0b11010, (rd), (rn))
 #define arm_neon_fcvtpu_2d(p, rd, rn) arm_neon_2mvec_opcode ((p), VREG_FULL, 0b1, 0b10 | SIZE_2, 0b11010, (rd), (rn))
 
-#define arm_neon_fcvtzu_2s(p, rd, rn) arm_neon_2mvec_opcode ((p), VREG_LOW, 0b1, 0b10 | SIZE_1, 0b11011, (rd), (rn))
-#define arm_neon_fcvtzu_4s(p, rd, rn) arm_neon_2mvec_opcode ((p), VREG_FULL, 0b1, 0b10 | SIZE_1, 0b11011, (rd), (rn))
-#define arm_neon_fcvtzu_2d(p, rd, rn) arm_neon_2mvec_opcode ((p), VREG_FULL, 0b1, 0b10 | SIZE_2, 0b11011, (rd), (rn))
-
 #define arm_neon_ursqrte_2s(p, rd, rn) arm_neon_2mvec_opcode ((p), VREG_LOW, 0b1, 0b10 | SIZE_1, 0b11100, (rd), (rn))
 #define arm_neon_ursqrte_4s(p, rd, rn) arm_neon_2mvec_opcode ((p), VREG_FULL, 0b1, 0b10 | SIZE_1, 0b11100, (rd), (rn))
 
diff --git a/src/mono/mono/mini/simd-arm64.h b/src/mono/mono/mini/simd-arm64.h
index b9c0b75..9bd2c31 100644
--- a/src/mono/mono/mini/simd-arm64.h
+++ b/src/mono/mono/mini/simd-arm64.h
@@ -55,6 +55,10 @@ SIMD_OP  (128, OP_XUNOP,       OP_ARM64_SXTL,        TDS,                arm_neo
 SIMD_OP  (128, OP_XUNOP,       OP_ARM64_SXTL2,       TDS,                arm_neon_sxtl2,   arm_neon_sxtl2,   arm_neon_sxtl2,  _UNDEF,            _UNDEF,           _UNDEF)
 SIMD_OP  (128, OP_XUNOP,       OP_ARM64_UXTL,        TDS,                arm_neon_uxtl,    arm_neon_uxtl,    arm_neon_uxtl,   _UNDEF,            _UNDEF,           _UNDEF)
 SIMD_OP  (128, OP_XUNOP,       OP_ARM64_UXTL2,       TDS,                arm_neon_uxtl2,   arm_neon_uxtl2,   arm_neon_uxtl2,  _UNDEF,            _UNDEF,           _UNDEF)
+SIMD_OP  (128, OP_XUNOP,       OP_CVT_FP_SI,         WTDS,               _UNDEF,           _UNDEF,           arm_neon_fcvtzs, arm_neon_fcvtzs,   _UNDEF,           _UNDEF)
+SIMD_OP  (128, OP_XUNOP,       OP_CVT_FP_UI,         WTDS,               _UNDEF,           _UNDEF,           arm_neon_fcvtzu, arm_neon_fcvtzu,   _UNDEF,           _UNDEF)
+SIMD_OP  (128, OP_XUNOP,       OP_CVT_SI_FP,         WTDS,               _UNDEF,           _UNDEF,           _UNDEF,          _UNDEF,            arm_neon_scvtf,   arm_neon_scvtf)
+SIMD_OP  (128, OP_XUNOP,       OP_CVT_UI_FP,         WTDS,               _UNDEF,           _UNDEF,           _UNDEF,          _UNDEF,            arm_neon_ucvtf,   arm_neon_ucvtf)
 SIMD_OP  (128, OP_XBINOP,      OP_IADD,              WTDSS,              arm_neon_add,     arm_neon_add,     arm_neon_add,    arm_neon_add,      _UNDEF,           _UNDEF)
 SIMD_OP  (128, OP_XBINOP,      OP_FADD,              WTDSS,              _UNDEF,           _UNDEF,           _UNDEF,          _UNDEF,            arm_neon_fadd,    arm_neon_fadd)
 SIMD_OP  (128, OP_XBINOP,      OP_ISUB,              WTDSS,              arm_neon_sub,     arm_neon_sub,     arm_neon_sub,    arm_neon_sub,      _UNDEF,           _UNDEF)
diff --git a/src/mono/mono/mini/simd-intrinsics.c b/src/mono/mono/mini/simd-intrinsics.c
index 3ec68e1..29bdf03 100644
--- a/src/mono/mono/mini/simd-intrinsics.c
+++ b/src/mono/mono/mini/simd-intrinsics.c
@@ -1349,12 +1349,6 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi
 		if (!(!strcmp (m_class_get_name (cmethod->klass), "Vector128") || !strcmp (m_class_get_name (cmethod->klass), "Vector")))
 			return NULL;
 		switch (id) {
-		case SN_ConvertToDouble:
-		case SN_ConvertToInt32:
-		case SN_ConvertToInt64:
-		case SN_ConvertToSingle:
-		case SN_ConvertToUInt32:
-		case SN_ConvertToUInt64:
 		case SN_Create:
 		case SN_GetLower:
 		case SN_GetUpper:
@@ -1484,9 +1478,16 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi
 #endif
 	}
 	case SN_ConvertToDouble: {
-#if defined(TARGET_ARM64) || defined(TARGET_AMD64)
 		if ((arg0_type != MONO_TYPE_I8) && (arg0_type != MONO_TYPE_U8))
 			return NULL;
+#if defined(TARGET_ARM64)
+		if (!COMPILE_LLVM (cfg)) {
+			return emit_simd_ins_for_sig (cfg, klass, OP_XUNOP, 
+				arg0_type == MONO_TYPE_I8 ? OP_CVT_SI_FP : OP_CVT_UI_FP, 
+				MONO_TYPE_R8, fsig, args);
+		}
+#endif
+#if defined(TARGET_ARM64) || defined(TARGET_AMD64)
 		MonoClass *arg_class = mono_class_from_mono_type_internal (fsig->params [0]);
 		int size = mono_class_value_size (arg_class, NULL);
 		int op = -1;
@@ -1501,9 +1502,17 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi
 	}
 	case SN_ConvertToInt32: 
 	case SN_ConvertToUInt32: {
-#if defined(TARGET_ARM64) || defined(TARGET_AMD64)
 		if (arg0_type != MONO_TYPE_R4)
 			return NULL;
+#if defined(TARGET_ARM64)
+		if (!COMPILE_LLVM (cfg)) {
+			return emit_simd_ins_for_sig (cfg, klass, OP_XUNOP, 
+				id == SN_ConvertToInt32 ? OP_CVT_FP_SI : OP_CVT_FP_UI, 
+				id == SN_ConvertToInt32 ? MONO_TYPE_I4 : MONO_TYPE_U4, 
+				fsig, args);
+		}
+#endif
+#if defined(TARGET_ARM64) || defined(TARGET_AMD64)
 		int op = id == SN_ConvertToInt32 ? OP_CVT_FP_SI : OP_CVT_FP_UI;
 		return emit_simd_ins_for_sig (cfg, klass, op, -1, arg0_type, fsig, args);
 #else
@@ -1512,9 +1521,17 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi
 	}
 	case SN_ConvertToInt64:
 	case SN_ConvertToUInt64: {
-#if defined(TARGET_ARM64) || defined(TARGET_AMD64)
 		if (arg0_type != MONO_TYPE_R8)
 			return NULL;
+#if defined(TARGET_ARM64)
+		if (!COMPILE_LLVM (cfg)) {
+			return emit_simd_ins_for_sig (cfg, klass, OP_XUNOP, 
+				id == SN_ConvertToInt64 ? OP_CVT_FP_SI : OP_CVT_FP_UI, 
+				id == SN_ConvertToInt64 ? MONO_TYPE_I8 : MONO_TYPE_U8, 
+				fsig, args);
+		}
+#endif
+#if defined(TARGET_ARM64) || defined(TARGET_AMD64)
 		MonoClass *arg_class = mono_class_from_mono_type_internal (fsig->params [0]);
 		int size = mono_class_value_size (arg_class, NULL);
 		int op = -1;
@@ -1528,9 +1545,16 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi
 #endif
 	}
 	case SN_ConvertToSingle: {
-#if defined(TARGET_ARM64) || defined(TARGET_AMD64)
 		if ((arg0_type != MONO_TYPE_I4) && (arg0_type != MONO_TYPE_U4))
 			return NULL;
+#if defined(TARGET_ARM64)
+		if (!COMPILE_LLVM (cfg)) {
+			return emit_simd_ins_for_sig (cfg, klass, OP_XUNOP, 
+				arg0_type == MONO_TYPE_I4 ? OP_CVT_SI_FP : OP_CVT_UI_FP, 
+				MONO_TYPE_R4, fsig, args);
+		}
+#endif
+#if defined(TARGET_ARM64) || defined(TARGET_AMD64)
 		int op = arg0_type == MONO_TYPE_I4 ? OP_CVT_SI_FP : OP_CVT_UI_FP;
 		return emit_simd_ins_for_sig (cfg, klass, op, -1, arg0_type, fsig, args);
 #else
diff --git a/src/tests/issues.targets b/src/tests/issues.targets
index c9d17f9..f0d8e1b 100644
--- a/src/tests/issues.targets
+++ b/src/tests/issues.targets
@@ -3333,6 +3333,12 @@
         <ExcludeList Include="$(XunitTestBinBase)/Exceptions/ForeignThread/ForeignThreadExceptions/**">
             <Issue>https://github.com/dotnet/runtime/issues/82859</Issue>
         </ExcludeList>
+        <ExcludeList Include="$(XunitTestBinBase)/JIT/SIMD/VectorConvert_r_Target_64Bit/**">
+            <Issue>https://github.com/dotnet/runtime/issues/85316</Issue>
+        </ExcludeList>
+        <ExcludeList Include="$(XunitTestBinBase)/JIT/SIMD/VectorConvert_ro_Target_64Bit/**">
+            <Issue>https://github.com/dotnet/runtime/issues/85316</Issue>
+        </ExcludeList>
     </ItemGroup>
 
     <ItemGroup Condition=" '$(RuntimeFlavor)' == 'mono' and '$(TargetArchitecture)' == 'arm64' and '$(TargetsWindows)' != 'true' " >
-- 
2.7.4