[Mono] Enable the supported V128 SIMD intrinsics on Arm64 across all codegen engines...
author Fan Yang <52458914+fanyang-mono@users.noreply.github.com>
Wed, 12 Apr 2023 12:50:49 +0000 (08:50 -0400)
committer GitHub <noreply@github.com>
Wed, 12 Apr 2023 12:50:49 +0000 (08:50 -0400)
* Enable the supported ones

* Add support for Create* and fix a bug

* Fix CreateScalar for floating types

* Fix create*

* Address review feedback

src/mono/mono/arch/arm64/arm64-codegen.h
src/mono/mono/mini/cpu-arm64.mdesc
src/mono/mono/mini/mini-arm64.c
src/mono/mono/mini/mini-ops.h
src/mono/mono/mini/simd-intrinsics.c

index 67c0864ed91bd74a1412a9bf40bc4c0165756cf7..49dc27294832a8b39a3591d0d55a1f73e972b60a 100644 (file)
@@ -1111,8 +1111,8 @@ arm_encode_arith_imm (int imm, guint32 *shift)
 /* NEON :: extract */
 #define arm_neon_extr_opcode(p, q, op2, imm4, rd, rn, rm) arm_neon_opcode_3reg ((p), (q), 0b00101110000000000000000000000000 | (op2) << 22 | (imm4) << 11, (rd), (rn), (rm))
 
-#define arm_neon_ext_8b(p, rd, rn, rm, index) arm_neon_extr_opcode ((p), VREG_LOW, 0b00, (index), (rd), (rn), (rd))
-#define arm_neon_ext_16b(p, rd, rn, rm, index) arm_neon_extr_opcode ((p), VREG_FULL, 0b00, (index), (rd), (rn), (rd))
+#define arm_neon_ext_8b(p, rd, rn, rm, index) arm_neon_extr_opcode ((p), VREG_LOW, 0b00, (index), (rd), (rn), (rm))
+#define arm_neon_ext_16b(p, rd, rn, rm, index) arm_neon_extr_opcode ((p), VREG_FULL, 0b00, (index), (rd), (rn), (rm))
 
 /* NEON :: copy */
 #define arm_neon_cpy_opcode(p, q, op, imm5, imm4, rd, rn) arm_neon_opcode_2reg ((p), (q), 0b00001110000000000000010000000000 | (op) << 29 | (imm5) << 16 | (imm4) << 11, (rd), (rn))
index 0e27f23291ad342f803619a60a46c61912ce073e..8da7174c0fd58cb5706a66bc8c04f7a2398874a5 100644 (file)
@@ -521,6 +521,16 @@ expand_i4: dest:x src1:i len:4
 expand_i8: dest:x src1:i len:4
 expand_r4: dest:x src1:f len:4
 expand_r8: dest:x src1:f len:4
+insert_i1: dest:x src1:i len:4
+insert_i2: dest:x src1:i len:4
+insert_i4: dest:x src1:i len:4
+insert_i8: dest:x src1:i len:4
+insert_r4: dest:x src1:f len:4
+insert_r8: dest:x src1:f len:4
+create_scalar_int: dest:x src1:i len:8
+create_scalar_float: dest:x src1:f len:12
+create_scalar_unsafe_int: dest:x src1:i len:4
+create_scalar_unsafe_float: dest:x src1:f len:4
 
 generic_class_init: src1:a len:44 clob:c
 gc_safe_point: src1:i len:12 clob:c
index e9e9ff6b0edcbeaea6d70e68886938496b9898dd..3d23bc27da4ba2c03ce9b4f6b2ee9902e257c895 100644 (file)
@@ -3717,6 +3717,48 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                        }
                        break;
                }
+                       /* SIMD that is not table-generated */
+                       /* TODO: once https://github.com/dotnet/runtime/issues/83252 is done,
+                        * move the following two to the codegen table in simd-arm64.h
+                        */
+               case OP_ONES_COMPLEMENT:
+                       arm_neon_not (code, get_vector_size_macro (ins), dreg, sreg1);
+                       break;
+               case OP_NEGATION:
+                       if (is_type_float_macro (ins->inst_c1)) {
+                               arm_neon_fneg (code, get_vector_size_macro (ins), get_type_size_macro (ins->inst_c1), dreg, sreg1);
+                       } else {
+                               arm_neon_neg (code, get_vector_size_macro (ins), get_type_size_macro (ins->inst_c1), dreg, sreg1);
+                       }
+                       break;
+               case OP_XBINOP:
+                       switch (ins->inst_c0) {
+                       case OP_IMAX:
+                               code = emit_smax_i8 (code, get_vector_size_macro (ins), get_type_size_macro (ins->inst_c1), dreg, sreg1, sreg2);
+                               break;
+                       case OP_IMAX_UN:
+                               code = emit_umax_i8 (code, get_vector_size_macro (ins), get_type_size_macro (ins->inst_c1), dreg, sreg1, sreg2);
+                               break;
+                       case OP_IMIN:
+                               code = emit_smin_i8 (code, get_vector_size_macro (ins), get_type_size_macro (ins->inst_c1), dreg, sreg1, sreg2);
+                               break;
+                       case OP_IMIN_UN:
+                               code = emit_umin_i8 (code, get_vector_size_macro (ins), get_type_size_macro (ins->inst_c1), dreg, sreg1, sreg2);
+                               break;
+                       default:
+                               g_assert_not_reached ();
+                       }
+                       break;
+               case OP_XZERO:
+                       arm_neon_eor_16b (code, dreg, dreg, dreg);
+                       break;
+               case OP_XONES:
+                       arm_neon_eor_16b (code, dreg, dreg, dreg);
+                       arm_neon_not_16b (code, dreg, dreg);
+                       break;
+               case OP_XEXTRACT: 
+                       code = emit_xextract (code, VREG_FULL, ins->inst_c0, dreg, sreg1);
+                       break;
                case OP_STOREX_MEMBASE:
                        code = emit_strfpq (code, sreg1, dreg, ins->inst_offset);
                        break;
@@ -3730,10 +3772,10 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                        if (cfg->compile_aot && cfg->code_exec_only) {
                                mono_add_patch_info (cfg, offset, MONO_PATCH_INFO_X128_GOT, ins->inst_p0);
                                arm_ldrx_lit (code, ARMREG_IP0, 0);
-                               arm_ldrfpq (code, ins->dreg, ARMREG_IP0, 0);
+                               arm_ldrfpq (code, dreg, ARMREG_IP0, 0);
                        } else {
                                mono_add_patch_info (cfg, offset, MONO_PATCH_INFO_X128, ins->inst_p0);
-                               arm_neon_ldrq_lit (code, ins->dreg, 0);
+                               arm_neon_ldrq_lit (code, dreg, 0);
                        }
                        break;
                }
@@ -3744,13 +3786,13 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                case OP_EXPAND_I4:
                case OP_EXPAND_I8: {
                        const int t = get_type_size_macro (ins->inst_c1);
-                       arm_neon_dup_g (code, VREG_FULL, t, ins->dreg, ins->sreg1);
+                       arm_neon_dup_g (code, VREG_FULL, t, dreg, sreg1);
                        break;
                }
                case OP_EXPAND_R4:
                case OP_EXPAND_R8: {
                        const int t = get_type_size_macro (ins->inst_c1);
-                       arm_neon_fdup_e (code, VREG_FULL, t, ins->dreg, ins->sreg1, 0);
+                       arm_neon_fdup_e (code, VREG_FULL, t, dreg, sreg1, 0);
                        break;
                }
                case OP_EXTRACT_I1:
@@ -3760,9 +3802,9 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                        const int t = get_type_size_macro (ins->inst_c1);
                        // smov is not defined for i64
                        if (is_type_unsigned_macro (ins->inst_c1) || t == TYPE_I64) {
-                               arm_neon_umov (code, t, ins->dreg, ins->sreg1, ins->inst_c0);
+                               arm_neon_umov (code, t, dreg, sreg1, ins->inst_c0);
                        } else {
-                               arm_neon_smov (code, t, ins->dreg, ins->sreg1, ins->inst_c0);
+                               arm_neon_smov (code, t, dreg, sreg1, ins->inst_c0);
                        }
                        break;
                }
@@ -3773,17 +3815,39 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                                // Technically, this broadcasts element #inst_c0 to all dest XREG elements; whereas it should
                                // set the FREG to the said element. Since FREG and XREG pool is the same on arm64 and the rest
                                // of the F/XREG is ignored in FREG mode, this operation remains valid.
-                               arm_neon_fdup_e (code, VREG_FULL, t, ins->dreg, ins->sreg1, ins->inst_c0);
+                               arm_neon_fdup_e (code, VREG_FULL, t, dreg, sreg1, ins->inst_c0);
                        }
                        break;
+               case OP_INSERT_I1:
+               case OP_INSERT_I2:
+               case OP_INSERT_I4:
+               case OP_INSERT_I8: {
+                       const int t = get_type_size_macro (ins->inst_c1);
+                       arm_neon_ins_g(code, t, dreg, sreg1, ins->inst_c0);
+                       break;
+               }
+               case OP_INSERT_R4:
+               case OP_INSERT_R8: {
+                       int t = 0;
+                       switch (ins->inst_c1) {
+                       case MONO_TYPE_R4:
+                               t = SIZE_4;
+                               break;
+                       case MONO_TYPE_R8:
+                               t = SIZE_8;
+                               break;
+                       }
+                       arm_neon_ins_e(code, t, dreg, sreg1, ins->inst_c0, 0);
+                       break;
+               }
                case OP_ARM64_XADDV: {
                        switch (ins->inst_c0) {
                        case INTRINS_AARCH64_ADV_SIMD_FADDV:
                                if (ins->inst_c1 == MONO_TYPE_R8) {
-                                       arm_neon_faddp (code, VREG_FULL, TYPE_F64, ins->dreg, ins->sreg1, ins->sreg1);
+                                       arm_neon_faddp (code, VREG_FULL, TYPE_F64, dreg, sreg1, sreg1);
                                } else if (ins->inst_c1 == MONO_TYPE_R4) {
-                                       arm_neon_faddp (code, VREG_FULL, TYPE_F32, ins->dreg, ins->sreg1, ins->sreg1);
-                                       arm_neon_faddp (code, VREG_FULL, TYPE_F32, ins->dreg, ins->dreg, ins->dreg);
+                                       arm_neon_faddp (code, VREG_FULL, TYPE_F32, dreg, sreg1, sreg1);
+                                       arm_neon_faddp (code, VREG_FULL, TYPE_F32, dreg, dreg, dreg);
                                } else {
                                        g_assert_not_reached ();
                                } 
@@ -3792,7 +3856,7 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                        case INTRINS_AARCH64_ADV_SIMD_UADDV:
                        case INTRINS_AARCH64_ADV_SIMD_SADDV: 
                                if (get_type_size_macro (ins->inst_c1) == TYPE_I64) 
-                                       arm_neon_addp (code, VREG_FULL, TYPE_I64, ins->dreg, ins->sreg1, ins->sreg1);
+                                       arm_neon_addp (code, VREG_FULL, TYPE_I64, dreg, sreg1, sreg1);
                                else
                                        g_assert_not_reached (); // remaining int types are handled through the codegen table
                                break;
@@ -3802,6 +3866,52 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                        }
                        break;
                }
+               case OP_CREATE_SCALAR_INT: {
+                       const int t = get_type_size_macro (ins->inst_c1);
+                       arm_neon_eor_16b (code, dreg, dreg, dreg);
+                       arm_neon_ins_g(code, t, dreg, sreg1, 0);
+                       break;
+               }
+               case OP_CREATE_SCALAR_FLOAT: {
+                       int t = 0;
+                       switch (ins->inst_c1) {
+                       case MONO_TYPE_R4:
+                               t = SIZE_4;
+                               break;
+                       case MONO_TYPE_R8:
+                               t = SIZE_8;
+                               break;
+                       }
+                       // Use a temp register for zero op, as sreg1 and dreg share the same register here
+                       arm_neon_eor_16b (code, NEON_TMP_REG, NEON_TMP_REG, NEON_TMP_REG);
+                       arm_neon_ins_e(code, t, NEON_TMP_REG, sreg1, 0, 0);
+                       arm_neon_mov (code, dreg, NEON_TMP_REG);
+                       break;
+               }
+               case OP_CREATE_SCALAR_UNSAFE_INT: {
+                       const int t = get_type_size_macro (ins->inst_c1);
+                       arm_neon_ins_g(code, t, dreg, sreg1, 0);
+                       break;
+               }
+               case OP_CREATE_SCALAR_UNSAFE_FLOAT: {
+                       if (dreg != sreg1) {
+                               int t = 0;
+                               switch (ins->inst_c1) {
+                               case MONO_TYPE_R4:
+                                       t = SIZE_4;
+                                       break;
+                               case MONO_TYPE_R8:
+                                       t = SIZE_8;
+                                       break;
+                               }
+                               arm_neon_ins_e(code, t, dreg, sreg1, 0, 0);
+                       }
+                       break;
+               }
+               // Enable this when adding support for Narrow and enable support for Create at the same time
+               // case OP_XCONCAT:
+               //      arm_neon_ext_16b(code, dreg, sreg1, sreg2, 8);
+               //      break;
                
                        /* BRANCH */
                case OP_BR:
@@ -3875,49 +3985,6 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
                        arm_cbnzx (code, sreg1, 0);
                        break;
 
-                       /* SIMD that is not table-generated */
-                       /* TODO: once https://github.com/dotnet/runtime/issues/83252 is done,
-                        * move the following two to the codegen table in simd-arm64.h
-                        */
-               case OP_ONES_COMPLEMENT:
-                       arm_neon_not (code, get_vector_size_macro (ins), dreg, sreg1);
-                       break;
-               case OP_NEGATION:
-                       if (is_type_float_macro (ins->inst_c1)) {
-                               arm_neon_fneg (code, get_vector_size_macro (ins), get_type_size_macro (ins->inst_c1), dreg, sreg1);
-                       } else {
-                               arm_neon_neg (code, get_vector_size_macro (ins), get_type_size_macro (ins->inst_c1), dreg, sreg1);
-                       }
-                       break;
-               case OP_XBINOP:
-                       switch (ins->inst_c0) {
-                       case OP_IMAX:
-                               code = emit_smax_i8 (code, get_vector_size_macro (ins), get_type_size_macro (ins->inst_c1), dreg, sreg1, sreg2);
-                               break;
-                       case OP_IMAX_UN:
-                               code = emit_umax_i8 (code, get_vector_size_macro (ins), get_type_size_macro (ins->inst_c1), dreg, sreg1, sreg2);
-                               break;
-                       case OP_IMIN:
-                               code = emit_smin_i8 (code, get_vector_size_macro (ins), get_type_size_macro (ins->inst_c1), dreg, sreg1, sreg2);
-                               break;
-                       case OP_IMIN_UN:
-                               code = emit_umin_i8 (code, get_vector_size_macro (ins), get_type_size_macro (ins->inst_c1), dreg, sreg1, sreg2);
-                               break;
-                       default:
-                               g_assert_not_reached ();
-                       }
-                       break;
-               case OP_XZERO:
-                       arm_neon_eor_16b (code, dreg, dreg, dreg);
-                       break;
-               case OP_XONES:
-                       arm_neon_eor_16b (code, dreg, dreg, dreg);
-                       arm_neon_not_16b (code, dreg, dreg);
-                       break;
-               case OP_XEXTRACT: 
-                       code = emit_xextract (code, VREG_FULL, ins->inst_c0, dreg, sreg1);
-                       break;
-
                        /* ALU */
                case OP_IADD:
                        arm_addw (code, dreg, sreg1, sreg2);
index 18128181f54999d56c33981b57b523ef23668d93..4f45bc347d224b71f2dd292c15c6cbe87e9f78bf 100644 (file)
@@ -1168,6 +1168,11 @@ MINI_OP3(OP_MULX_HL64, "mulxhl64", LREG, LREG, LREG, LREG)
 MINI_OP(OP_CREATE_SCALAR_UNSAFE, "create_scalar_unsafe", XREG, XREG, NONE)
 MINI_OP(OP_CREATE_SCALAR, "create_scalar", XREG, XREG, NONE)
 
+MINI_OP(OP_CREATE_SCALAR_UNSAFE_INT, "create_scalar_unsafe_int", XREG, IREG, NONE)
+MINI_OP(OP_CREATE_SCALAR_UNSAFE_FLOAT, "create_scalar_unsafe_float", XREG, FREG, NONE)
+MINI_OP(OP_CREATE_SCALAR_INT, "create_scalar_int", XREG, IREG, NONE)
+MINI_OP(OP_CREATE_SCALAR_FLOAT, "create_scalar_float", XREG, FREG, NONE)
+
 MINI_OP(OP_XMOVE,   "xmove", XREG, XREG, NONE)
 MINI_OP(OP_XZERO,   "xzero", XREG, NONE, NONE)
 MINI_OP(OP_XONES,   "xones", XREG, NONE, NONE)
index 8f8d158f52e834b0fdf5b33d2a9e77cbe07893bc..7e89aba8d6981e6712d5dc27b4a33eacdd731d32 100644 (file)
@@ -843,9 +843,9 @@ type_to_expand_op (MonoTypeEnum type)
 }
 
 static int
-type_to_insert_op (MonoType *type)
+type_to_insert_op (MonoTypeEnum type)
 {
-       switch (type->type) {
+       switch (type) {
        case MONO_TYPE_I1:
        case MONO_TYPE_U1:
                return OP_INSERT_I1;
@@ -992,14 +992,15 @@ support_probe_complete:
 static MonoInst *
 emit_vector_create_elementwise (
        MonoCompile *cfg, MonoMethodSignature *fsig, MonoType *vtype,
-       MonoType *etype, MonoInst **args)
+       MonoTypeEnum type, MonoInst **args)
 {
-       int op = type_to_insert_op (etype);
+       int op = type_to_insert_op (type);
        MonoClass *vklass = mono_class_from_mono_type_internal (vtype);
        MonoInst *ins = emit_xzero (cfg, vklass);
        for (int i = 0; i < fsig->param_count; ++i) {
                ins = emit_simd_ins (cfg, vklass, op, ins->dreg, args [i]->dreg);
                ins->inst_c0 = i;
+               ins->inst_c1 = type;
        }
        return ins;
 }
@@ -1097,11 +1098,6 @@ static guint16 sri_vector_methods [] = {
        SN_AsUInt16,
        SN_AsUInt32,
        SN_AsUInt64,
-       SN_AsVector128,
-       SN_AsVector2,
-       SN_AsVector256,
-       SN_AsVector3,
-       SN_AsVector4,
        SN_BitwiseAnd,
        SN_BitwiseOr,
        SN_Ceiling,
@@ -1150,8 +1146,6 @@ static guint16 sri_vector_methods [] = {
        SN_ToScalar,
        SN_ToVector128,
        SN_ToVector128Unsafe,
-       SN_ToVector256,
-       SN_ToVector256Unsafe,
        SN_WidenLower,
        SN_WidenUpper,
        SN_WithElement,
@@ -1216,11 +1210,6 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi
        if (!COMPILE_LLVM (cfg))
                return NULL;
 #endif
-// FIXME: This limitation could be removed once everything here are supported by mini JIT on arm64
-#ifdef TARGET_ARM64
-       if (!(cfg->compile_aot && cfg->full_aot && !cfg->interp))
-               return NULL;
-#endif
 
        int id = lookup_intrins (sri_vector_methods, sizeof (sri_vector_methods), cmethod);
        if (id == -1) {
@@ -1228,64 +1217,40 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi
                return NULL;
        }
 
-       if (!strcmp (m_class_get_name (cfg->method->klass), "Vector256") || !strcmp (m_class_get_name (cfg->method->klass), "Vector512"))
+       if (!strcmp (m_class_get_name (cmethod->klass), "Vector256") || !strcmp (m_class_get_name (cmethod->klass), "Vector512"))
                return NULL; 
                
 // FIXME: This limitation could be removed once everything here are supported by mini JIT on arm64
 #ifdef TARGET_ARM64
        if (!COMPILE_LLVM (cfg)) {
+               if (!(!strcmp (m_class_get_name (cmethod->klass), "Vector128") || !strcmp (m_class_get_name (cmethod->klass), "Vector")))
+                       return NULL;
                switch (id) {
-               case SN_Add:
-               case SN_Equals:
-               case SN_GreaterThan:
-               case SN_GreaterThanOrEqual:
-               case SN_LessThan:
-               case SN_LessThanOrEqual:
-               case SN_Negate:
-               case SN_OnesComplement:
-               case SN_EqualsAny:
-               case SN_GreaterThanAny:
-               case SN_GreaterThanOrEqualAny:
-               case SN_LessThanAny:
-               case SN_LessThanOrEqualAny:
-               case SN_EqualsAll:
-               case SN_GreaterThanAll:
-               case SN_GreaterThanOrEqualAll:
-               case SN_LessThanAll:
-               case SN_LessThanOrEqualAll:
-               case SN_Subtract:
-               case SN_BitwiseAnd:
-               case SN_BitwiseOr:
-               case SN_Xor:
-               case SN_As:
-               case SN_AsByte:
-               case SN_AsDouble:
-               case SN_AsInt16:
-               case SN_AsInt32:
-               case SN_AsInt64:
-               case SN_AsSByte:
-               case SN_AsSingle:
-               case SN_AsUInt16:
-               case SN_AsUInt32:
-               case SN_AsUInt64:
-               case SN_Max:
-               case SN_Min:
-               case SN_Sum:
-               case SN_ToScalar:
-               case SN_Floor:
-               case SN_Ceiling:
-               case SN_Divide:
-               case SN_Multiply:
-               case SN_Sqrt:
-               case SN_Abs:
-                       break;
-               default: 
+               case SN_AndNot:
+               case SN_ConditionalSelect:
+               case SN_ConvertToDouble:
+               case SN_ConvertToInt32:
+               case SN_ConvertToInt64:
+               case SN_ConvertToSingle:
+               case SN_ConvertToUInt32:
+               case SN_ConvertToUInt64:
+               case SN_Create:
+               case SN_Dot:
+               case SN_ExtractMostSignificantBits:
+               case SN_GetElement:
+               case SN_GetLower:
+               case SN_GetUpper:
+               case SN_Narrow:
+               case SN_Shuffle:
+               case SN_ToVector128:
+               case SN_ToVector128Unsafe:
+               case SN_WidenLower:
+               case SN_WidenUpper:
+               case SN_WithElement:
                        return NULL;
+               default:
+                       break;
                }
-               MonoClass *arg0_class = mono_class_from_mono_type_internal (fsig->params [0]);
-               int class_size = mono_class_value_size (arg0_class, NULL);
-               if (class_size != 16)
-                       return NULL;
        }
 #endif
 
@@ -1462,25 +1427,44 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi
                MonoType *etype = get_vector_t_elem_type (fsig->ret);
                if (!MONO_TYPE_IS_VECTOR_PRIMITIVE (etype))
                        return NULL;
-               if (fsig->param_count == 1 && mono_metadata_type_equal (fsig->params [0], etype))
-                       return emit_simd_ins (cfg, klass, type_to_expand_op (etype->type), args [0]->dreg, -1);
-               else if (is_create_from_half_vectors_overload (fsig))
+               if (fsig->param_count == 1 && mono_metadata_type_equal (fsig->params [0], etype)) {
+                       MonoInst* ins = emit_simd_ins (cfg, klass, type_to_expand_op (etype->type), args [0]->dreg, -1);
+                       ins->inst_c1 = arg0_type;
+                       return ins;
+               } else if (is_create_from_half_vectors_overload (fsig))
                        return emit_simd_ins (cfg, klass, OP_XCONCAT, args [0]->dreg, args [1]->dreg);
                else if (is_elementwise_create_overload (fsig, etype))
-                       return emit_vector_create_elementwise (cfg, fsig, fsig->ret, etype, args);
+                       return emit_vector_create_elementwise (cfg, fsig, fsig->ret, arg0_type, args);
                break;
        }
        case SN_CreateScalar: {
                MonoType *etype = get_vector_t_elem_type (fsig->ret);
                if (!MONO_TYPE_IS_VECTOR_PRIMITIVE (etype))
                        return NULL;
-               return emit_simd_ins_for_sig (cfg, klass, OP_CREATE_SCALAR, -1, arg0_type, fsig, args);
+               if (COMPILE_LLVM (cfg))
+                       return emit_simd_ins_for_sig (cfg, klass, OP_CREATE_SCALAR, -1, arg0_type, fsig, args);
+               else {
+                       if (type_enum_is_float (arg0_type)) {
+                               return emit_simd_ins_for_sig (cfg, klass, OP_CREATE_SCALAR_FLOAT, -1, arg0_type, fsig, args);
+                       } else {
+                               return emit_simd_ins_for_sig (cfg, klass, OP_CREATE_SCALAR_INT, -1, arg0_type, fsig, args);
+                       }
+               }
+
        }
        case SN_CreateScalarUnsafe: {
                MonoType *etype = get_vector_t_elem_type (fsig->ret);
                if (!MONO_TYPE_IS_VECTOR_PRIMITIVE (etype))
                        return NULL;
-               return emit_simd_ins_for_sig (cfg, klass, OP_CREATE_SCALAR_UNSAFE, -1, arg0_type, fsig, args);
+               if (COMPILE_LLVM (cfg))
+                       return emit_simd_ins_for_sig (cfg, klass, OP_CREATE_SCALAR_UNSAFE, -1, arg0_type, fsig, args);
+               else {
+                       if (type_enum_is_float (arg0_type)) {
+                               return emit_simd_ins_for_sig (cfg, klass, OP_CREATE_SCALAR_UNSAFE_FLOAT, -1, arg0_type, fsig, args);
+                       } else {
+                               return emit_simd_ins_for_sig (cfg, klass, OP_CREATE_SCALAR_UNSAFE_INT, -1, arg0_type, fsig, args);
+                       }
+               }
        }
        case SN_Dot: {
                if (!is_element_type_primitive (fsig->params [0]))