[wasm] Add Vector128 and PackedSimd support to the jiterpreter; add PackedSimd to the interpreter
author Katelyn Gadd <kg@luminance.org>
Sun, 7 May 2023 05:11:33 +0000 (22:11 -0700)
committer GitHub <noreply@github.com>
Sun, 7 May 2023 05:11:33 +0000 (22:11 -0700)
* Add PackedSIMD support to the interpreter (off by default)
* Add SIMD support to the jiterpreter
* Add runtime options governing interpreter vector128 and packedsimd support
* Add some R4 vector128 operations to the interpreter
* Fix jiterpreter MINT_POPCNT_I8 implementation
* Enable compiling the runtime with wasm simd support so that intrinsics can be used
* Add browser-bench measurements for packing vector128

23 files changed:
src/mono/CMakeLists.txt
src/mono/mono/mini/interp/interp-internals.h
src/mono/mono/mini/interp/interp-simd-intrins.def
src/mono/mono/mini/interp/interp-simd.c
src/mono/mono/mini/interp/interp-simd.h
src/mono/mono/mini/interp/interp.c
src/mono/mono/mini/interp/mintops.h
src/mono/mono/mini/interp/simd-methods.def
src/mono/mono/mini/interp/transform-simd.c
src/mono/mono/utils/options-def.h
src/mono/sample/wasm/browser-bench/Vector.cs
src/mono/wasm/runtime/CMakeLists.txt
src/mono/wasm/runtime/cwraps.ts
src/mono/wasm/runtime/genmintops.py
src/mono/wasm/runtime/jiterpreter-interp-entry.ts
src/mono/wasm/runtime/jiterpreter-jit-call.ts
src/mono/wasm/runtime/jiterpreter-opcodes.ts
src/mono/wasm/runtime/jiterpreter-support.ts
src/mono/wasm/runtime/jiterpreter-trace-generator.ts
src/mono/wasm/runtime/jiterpreter.ts
src/mono/wasm/runtime/wasm-simd-feature-detect.wasm [new file with mode: 0644]
src/mono/wasm/runtime/wasm-simd-feature-detect.wat [new file with mode: 0644]
src/mono/wasm/wasm.proj

index 92a0ac8..5ff644b 100644 (file)
@@ -270,6 +270,7 @@ elseif(CLR_CMAKE_HOST_OS STREQUAL "emscripten")
   add_compile_options(-Wno-strict-prototypes)
   add_compile_options(-Wno-unused-but-set-variable)
   add_compile_options(-Wno-single-bit-bitfield-constant-conversion)
+  add_compile_options(-msimd128)  # emscripten only: compile the runtime with wasm SIMD enabled so the wasm intrinsics can be used
   set(DISABLE_EXECUTABLES 1)
   # FIXME: Is there a cmake option for this ?
   set(DISABLE_SHARED_LIBS 1)
index a2bff18..dc38222 100644 (file)
@@ -102,7 +102,7 @@ typedef enum {
 
 #define PROFILE_INTERP 0
 
-#if !HOST_BROWSER && __GNUC__
+#if __GNUC__
 #define INTERP_ENABLE_SIMD
 #endif
 
@@ -342,6 +342,12 @@ mono_jiterp_stackval_from_data (MonoType *type, stackval *result, const void *da
 gpointer
 mono_jiterp_frame_data_allocator_alloc (FrameDataAllocator *stack, InterpFrame *frame, int size);
 
+gpointer
+mono_jiterp_get_simd_intrinsic (int arity, int index); // arity 1/2/3 selects the P_P / P_PP / P_PPP method table; index is the slot within that table
+
+int
+mono_jiterp_get_simd_opcode (int arity, int index); // same (arity, index) addressing, but returns the entry from the matching wasm opcode table
+
 #endif
 
 static inline int
index 57bbba1..9ed37a3 100644 (file)
-INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I1_ADD, interp_v128_i1_op_addition)
-INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I2_ADD, interp_v128_i2_op_addition)
-INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I4_ADD, interp_v128_i4_op_addition)
-
-INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I1_SUB, interp_v128_i1_op_subtraction)
-INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I2_SUB, interp_v128_i2_op_subtraction)
-INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I4_SUB, interp_v128_i4_op_subtraction)
-
-INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_BITWISE_AND, interp_v128_op_bitwise_and)
-INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_BITWISE_OR, interp_v128_op_bitwise_or)
-
-INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_BITWISE_EQUALITY, interp_v128_op_bitwise_equality)
-INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_BITWISE_INEQUALITY, interp_v128_op_bitwise_inequality)
-
-INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_EXCLUSIVE_OR, interp_v128_op_exclusive_or)
-
-INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I1_MULTIPLY, interp_v128_i1_op_multiply)
-INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I2_MULTIPLY, interp_v128_i2_op_multiply)
-INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I4_MULTIPLY, interp_v128_i4_op_multiply)
-
-INTERP_SIMD_INTRINSIC_P_P (INTERP_SIMD_INTRINSIC_V128_I1_NEGATION, interp_v128_i1_op_negation)
-INTERP_SIMD_INTRINSIC_P_P (INTERP_SIMD_INTRINSIC_V128_I2_NEGATION, interp_v128_i2_op_negation)
-INTERP_SIMD_INTRINSIC_P_P (INTERP_SIMD_INTRINSIC_V128_I4_NEGATION, interp_v128_i4_op_negation)
-
-INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I1_LEFT_SHIFT, interp_v128_i1_op_left_shift)
-INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I2_LEFT_SHIFT, interp_v128_i2_op_left_shift)
-INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I4_LEFT_SHIFT, interp_v128_i4_op_left_shift)
-INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I8_LEFT_SHIFT, interp_v128_i8_op_left_shift)
-
-INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I1_RIGHT_SHIFT, interp_v128_i1_op_right_shift)
-INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I2_RIGHT_SHIFT, interp_v128_i2_op_right_shift)
-INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I4_RIGHT_SHIFT, interp_v128_i4_op_right_shift)
-
-INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I1_URIGHT_SHIFT, interp_v128_i1_op_uright_shift)
-INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I2_URIGHT_SHIFT, interp_v128_i2_op_uright_shift)
-INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I4_URIGHT_SHIFT, interp_v128_i4_op_uright_shift)
-INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I8_URIGHT_SHIFT, interp_v128_i8_op_uright_shift)
-
-INTERP_SIMD_INTRINSIC_P_P (INTERP_SIMD_INTRINSIC_V128_ONES_COMPLEMENT, interp_v128_op_ones_complement)
-
-INTERP_SIMD_INTRINSIC_P_P (INTERP_SIMD_INTRINSIC_V128_U2_WIDEN_LOWER, interp_v128_u2_widen_lower)
-INTERP_SIMD_INTRINSIC_P_P (INTERP_SIMD_INTRINSIC_V128_U2_WIDEN_UPPER, interp_v128_u2_widen_upper)
-
-INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_U1_NARROW, interp_v128_u1_narrow)
-
-INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_U1_GREATER_THAN, interp_v128_u1_greater_than)
-
-INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I1_LESS_THAN, interp_v128_i1_less_than)
-INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_U1_LESS_THAN, interp_v128_u1_less_than)
-INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I2_LESS_THAN, interp_v128_i2_less_than)
-
-INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I1_EQUALS, interp_v128_i1_equals)
-INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I2_EQUALS, interp_v128_i2_equals)
-INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I4_EQUALS, interp_v128_i4_equals)
-INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I8_EQUALS, interp_v128_i8_equals)
-
-INTERP_SIMD_INTRINSIC_P_P (INTERP_SIMD_INTRINSIC_V128_I1_CREATE_SCALAR, interp_v128_i1_create_scalar)
-INTERP_SIMD_INTRINSIC_P_P (INTERP_SIMD_INTRINSIC_V128_I2_CREATE_SCALAR, interp_v128_i2_create_scalar)
-INTERP_SIMD_INTRINSIC_P_P (INTERP_SIMD_INTRINSIC_V128_I4_CREATE_SCALAR, interp_v128_i4_create_scalar)
-INTERP_SIMD_INTRINSIC_P_P (INTERP_SIMD_INTRINSIC_V128_I8_CREATE_SCALAR, interp_v128_i8_create_scalar)
-
-INTERP_SIMD_INTRINSIC_P_P (INTERP_SIMD_INTRINSIC_V128_I1_EXTRACT_MSB, interp_v128_i1_extract_msb)
-INTERP_SIMD_INTRINSIC_P_P (INTERP_SIMD_INTRINSIC_V128_I2_EXTRACT_MSB, interp_v128_i2_extract_msb)
-INTERP_SIMD_INTRINSIC_P_P (INTERP_SIMD_INTRINSIC_V128_I4_EXTRACT_MSB, interp_v128_i4_extract_msb)
-INTERP_SIMD_INTRINSIC_P_P (INTERP_SIMD_INTRINSIC_V128_I8_EXTRACT_MSB, interp_v128_i8_extract_msb)
-
-INTERP_SIMD_INTRINSIC_P_PPP (INTERP_SIMD_INTRINSIC_V128_CONDITIONAL_SELECT, interp_v128_conditional_select)
-
-INTERP_SIMD_INTRINSIC_P_P (INTERP_SIMD_INTRINSIC_V128_I1_CREATE, interp_v128_i1_create)
-INTERP_SIMD_INTRINSIC_P_P (INTERP_SIMD_INTRINSIC_V128_I2_CREATE, interp_v128_i2_create)
-INTERP_SIMD_INTRINSIC_P_P (INTERP_SIMD_INTRINSIC_V128_I4_CREATE, interp_v128_i4_create)
-INTERP_SIMD_INTRINSIC_P_P (INTERP_SIMD_INTRINSIC_V128_I8_CREATE, interp_v128_i8_create)
-
-INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_AND_NOT, interp_v128_and_not)
-
-INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_U2_LESS_THAN_EQUAL, interp_v128_u2_less_than_equal)
-
-INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I1_SHUFFLE, interp_v128_i1_shuffle)
-INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I2_SHUFFLE, interp_v128_i2_shuffle)
-INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I4_SHUFFLE, interp_v128_i4_shuffle)
-INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I8_SHUFFLE, interp_v128_i8_shuffle)
+// FIXME: SIMD causes compile errors on WASI (so the PackedSimd entries are only emitted when HOST_BROWSER is defined)
+#ifdef HOST_BROWSER
+#ifndef INTERP_WASM_SIMD_INTRINSIC_V_P // a default is supplied only if the including file has not defined its own expansion
+#define INTERP_WASM_SIMD_INTRINSIC_V_P(id, c_intrinsic, wasm_opcode) INTERP_SIMD_INTRINSIC_P_P(id, _mono_interp_simd_ ## id, wasm_opcode) // default: forward to the one-operand table macro, naming the wrapper _mono_interp_simd_<id>; c_intrinsic is not used here
+#endif
+#ifndef INTERP_WASM_SIMD_INTRINSIC_V_V
+#define INTERP_WASM_SIMD_INTRINSIC_V_V(id, c_intrinsic, wasm_opcode) INTERP_SIMD_INTRINSIC_P_P(id, _mono_interp_simd_ ## id, wasm_opcode) // one operand -> P_P
+#endif
+#ifndef INTERP_WASM_SIMD_INTRINSIC_I_V
+#define INTERP_WASM_SIMD_INTRINSIC_I_V(id, c_intrinsic, wasm_opcode) INTERP_SIMD_INTRINSIC_P_P(id, _mono_interp_simd_ ## id, wasm_opcode) // one operand -> P_P
+#endif
+#ifndef INTERP_WASM_SIMD_INTRINSIC_V_VV
+#define INTERP_WASM_SIMD_INTRINSIC_V_VV(id, c_intrinsic, wasm_opcode) INTERP_SIMD_INTRINSIC_P_PP(id, _mono_interp_simd_ ## id, wasm_opcode) // two operands -> P_PP
+#endif
+#ifndef INTERP_WASM_SIMD_INTRINSIC_V_VI
+#define INTERP_WASM_SIMD_INTRINSIC_V_VI(id, c_intrinsic, wasm_opcode) INTERP_SIMD_INTRINSIC_P_PP(id, _mono_interp_simd_ ## id, wasm_opcode) // two operands -> P_PP
+#endif
+#ifndef INTERP_WASM_SIMD_INTRINSIC_V_VVV
+#define INTERP_WASM_SIMD_INTRINSIC_V_VVV(id, c_intrinsic, wasm_opcode) INTERP_SIMD_INTRINSIC_P_PPP(id, _mono_interp_simd_ ## id, wasm_opcode) // three operands -> P_PPP
+#endif
+#else // HOST_BROWSER
+#define INTERP_WASM_SIMD_INTRINSIC_V_P(id, c_intrinsic, wasm_opcode) // HOST_BROWSER not defined: every PackedSimd entry expands to nothing
+#define INTERP_WASM_SIMD_INTRINSIC_V_V(id, c_intrinsic, wasm_opcode)
+#define INTERP_WASM_SIMD_INTRINSIC_I_V(id, c_intrinsic, wasm_opcode)
+#define INTERP_WASM_SIMD_INTRINSIC_V_VV(id, c_intrinsic, wasm_opcode)
+#define INTERP_WASM_SIMD_INTRINSIC_V_VI(id, c_intrinsic, wasm_opcode)
+#define INTERP_WASM_SIMD_INTRINSIC_V_VVV(id, c_intrinsic, wasm_opcode)
+#endif // HOST_BROWSER
+
+// The third argument is the wasm opcode that corresponds to this simd intrinsic, if any.
+// Specify 0 if there is no exact 1:1 mapping (the opcode can still be implemented manually in the jiterpreter.)
+
+INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I1_ADD, interp_v128_i1_op_addition, 110)
+INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I2_ADD, interp_v128_i2_op_addition, 142)
+INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I4_ADD, interp_v128_i4_op_addition, 174)
+INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_R4_ADD, interp_v128_r4_op_addition, 228)
+
+INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I1_SUB, interp_v128_i1_op_subtraction, 113)
+INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I2_SUB, interp_v128_i2_op_subtraction, 145)
+INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I4_SUB, interp_v128_i4_op_subtraction, 177)
+INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_R4_SUB, interp_v128_r4_op_subtraction, 229)
+
+INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_BITWISE_AND, interp_v128_op_bitwise_and, 78)
+INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_BITWISE_OR, interp_v128_op_bitwise_or, 80)
+
+INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_BITWISE_EQUALITY, interp_v128_op_bitwise_equality, 0)
+INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_BITWISE_INEQUALITY, interp_v128_op_bitwise_inequality, 0)
+
+INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_EXCLUSIVE_OR, interp_v128_op_exclusive_or, 81)
+
+INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I1_MULTIPLY, interp_v128_i1_op_multiply, 0)
+INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I2_MULTIPLY, interp_v128_i2_op_multiply, 149)
+INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I4_MULTIPLY, interp_v128_i4_op_multiply, 181)
+INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_R4_MULTIPLY, interp_v128_r4_op_multiply, 230)
+
+INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_R4_DIVISION, interp_v128_r4_op_division, 231)
+
+INTERP_SIMD_INTRINSIC_P_P (INTERP_SIMD_INTRINSIC_V128_I1_NEGATION, interp_v128_i1_op_negation, 97)
+INTERP_SIMD_INTRINSIC_P_P (INTERP_SIMD_INTRINSIC_V128_I2_NEGATION, interp_v128_i2_op_negation, 129)
+INTERP_SIMD_INTRINSIC_P_P (INTERP_SIMD_INTRINSIC_V128_I4_NEGATION, interp_v128_i4_op_negation, 161)
+
+INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I1_LEFT_SHIFT, interp_v128_i1_op_left_shift, 107)
+INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I2_LEFT_SHIFT, interp_v128_i2_op_left_shift, 139)
+INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I4_LEFT_SHIFT, interp_v128_i4_op_left_shift, 171)
+INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I8_LEFT_SHIFT, interp_v128_i8_op_left_shift, 203)
+
+INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I1_RIGHT_SHIFT, interp_v128_i1_op_right_shift, 108)
+INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I2_RIGHT_SHIFT, interp_v128_i2_op_right_shift, 140)
+INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I4_RIGHT_SHIFT, interp_v128_i4_op_right_shift, 172)
+
+INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I1_URIGHT_SHIFT, interp_v128_i1_op_uright_shift, 109)
+INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I2_URIGHT_SHIFT, interp_v128_i2_op_uright_shift, 141)
+INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I4_URIGHT_SHIFT, interp_v128_i4_op_uright_shift, 173)
+INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I8_URIGHT_SHIFT, interp_v128_i8_op_uright_shift, 205)
+
+INTERP_SIMD_INTRINSIC_P_P (INTERP_SIMD_INTRINSIC_V128_ONES_COMPLEMENT, interp_v128_op_ones_complement, 77)
+
+INTERP_SIMD_INTRINSIC_P_P (INTERP_SIMD_INTRINSIC_V128_U2_WIDEN_LOWER, interp_v128_u2_widen_lower, 137)
+INTERP_SIMD_INTRINSIC_P_P (INTERP_SIMD_INTRINSIC_V128_U2_WIDEN_UPPER, interp_v128_u2_widen_upper, 138)
+
+INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_U1_NARROW, interp_v128_u1_narrow, 102)
+
+INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_U1_GREATER_THAN, interp_v128_u1_greater_than, 40)
+
+INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I1_LESS_THAN, interp_v128_i1_less_than, 37)
+INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_U1_LESS_THAN, interp_v128_u1_less_than, 38)
+INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I2_LESS_THAN, interp_v128_i2_less_than, 47)
+
+INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I1_EQUALS, interp_v128_i1_equals, 35)
+INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I2_EQUALS, interp_v128_i2_equals, 45)
+INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I4_EQUALS, interp_v128_i4_equals, 55)
+INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I8_EQUALS, interp_v128_i8_equals, 214)
+
+INTERP_SIMD_INTRINSIC_P_P (INTERP_SIMD_INTRINSIC_V128_I1_CREATE_SCALAR, interp_v128_i1_create_scalar, 0)
+INTERP_SIMD_INTRINSIC_P_P (INTERP_SIMD_INTRINSIC_V128_I2_CREATE_SCALAR, interp_v128_i2_create_scalar, 0)
+INTERP_SIMD_INTRINSIC_P_P (INTERP_SIMD_INTRINSIC_V128_I4_CREATE_SCALAR, interp_v128_i4_create_scalar, 0)
+INTERP_SIMD_INTRINSIC_P_P (INTERP_SIMD_INTRINSIC_V128_I8_CREATE_SCALAR, interp_v128_i8_create_scalar, 0)
+
+INTERP_SIMD_INTRINSIC_P_P (INTERP_SIMD_INTRINSIC_V128_I1_EXTRACT_MSB, interp_v128_i1_extract_msb, 0)
+INTERP_SIMD_INTRINSIC_P_P (INTERP_SIMD_INTRINSIC_V128_I2_EXTRACT_MSB, interp_v128_i2_extract_msb, 0)
+INTERP_SIMD_INTRINSIC_P_P (INTERP_SIMD_INTRINSIC_V128_I4_EXTRACT_MSB, interp_v128_i4_extract_msb, 0)
+INTERP_SIMD_INTRINSIC_P_P (INTERP_SIMD_INTRINSIC_V128_I8_EXTRACT_MSB, interp_v128_i8_extract_msb, 0)
+
+// wasm opcode is 0 because it has a different calling convention
+INTERP_SIMD_INTRINSIC_P_PPP (INTERP_SIMD_INTRINSIC_V128_CONDITIONAL_SELECT, interp_v128_conditional_select, 0)
+
+INTERP_SIMD_INTRINSIC_P_P (INTERP_SIMD_INTRINSIC_V128_I1_CREATE, interp_v128_i1_create, 0)
+INTERP_SIMD_INTRINSIC_P_P (INTERP_SIMD_INTRINSIC_V128_I2_CREATE, interp_v128_i2_create, 0)
+INTERP_SIMD_INTRINSIC_P_P (INTERP_SIMD_INTRINSIC_V128_I4_CREATE, interp_v128_i4_create, 0)
+INTERP_SIMD_INTRINSIC_P_P (INTERP_SIMD_INTRINSIC_V128_I8_CREATE, interp_v128_i8_create, 0)
+
+INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_AND_NOT, interp_v128_and_not, 79)
+
+INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_U2_LESS_THAN_EQUAL, interp_v128_u2_less_than_equal, 52)
+
+// wasm only has a swizzle opcode for i8x16, none of the others
+INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I1_SHUFFLE, interp_v128_i1_shuffle, 14)
+INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I2_SHUFFLE, interp_v128_i2_shuffle, 0)
+INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I4_SHUFFLE, interp_v128_i4_shuffle, 0)
+INTERP_SIMD_INTRINSIC_P_PP (INTERP_SIMD_INTRINSIC_V128_I8_SHUFFLE, interp_v128_i8_shuffle, 0)
+
+// Wasm PackedSimd (see PackedSimd.cs)
+// We automatically generate C wrappers around clang's wasm simd intrinsics for each of these intrinsics
+// The 2nd argument is the name of the clang intrinsic and the 3rd argument is the wasm opcode.
+
+INTERP_WASM_SIMD_INTRINSIC_V_P (INTERP_SIMD_INTRINSIC_WASM_I8X16_SPLAT, wasm_v128_load8_splat, 0x07)
+INTERP_WASM_SIMD_INTRINSIC_V_P (INTERP_SIMD_INTRINSIC_WASM_I16X8_SPLAT, wasm_v128_load16_splat, 0x08)
+INTERP_WASM_SIMD_INTRINSIC_V_P (INTERP_SIMD_INTRINSIC_WASM_I32X4_SPLAT, wasm_v128_load32_splat, 0x09)
+INTERP_WASM_SIMD_INTRINSIC_V_P (INTERP_SIMD_INTRINSIC_WASM_I64X2_SPLAT, wasm_v128_load64_splat, 0x0a)
+// FIXME: ExtractLane and ReplaceLane
+// FIXME: Shuffle
+INTERP_WASM_SIMD_INTRINSIC_V_VV (INTERP_SIMD_INTRINSIC_WASM_I8X16_SWIZZLE, wasm_i8x16_swizzle, 0x0e)
+// FIXME: f32/f64 versions of add/subtract/multiply/negate are missing
+INTERP_WASM_SIMD_INTRINSIC_V_VV (INTERP_SIMD_INTRINSIC_WASM_I8X16_ADD, wasm_i8x16_add, 0x6e)
+INTERP_WASM_SIMD_INTRINSIC_V_VV (INTERP_SIMD_INTRINSIC_WASM_I16X8_ADD, wasm_i16x8_add, 0x8e)
+INTERP_WASM_SIMD_INTRINSIC_V_VV (INTERP_SIMD_INTRINSIC_WASM_I32X4_ADD, wasm_i32x4_add, 0xae)
+INTERP_WASM_SIMD_INTRINSIC_V_VV (INTERP_SIMD_INTRINSIC_WASM_I64X2_ADD, wasm_i64x2_add, 0xce)
+INTERP_WASM_SIMD_INTRINSIC_V_VV (INTERP_SIMD_INTRINSIC_WASM_I8X16_SUBTRACT, wasm_i8x16_sub, 0x71)
+INTERP_WASM_SIMD_INTRINSIC_V_VV (INTERP_SIMD_INTRINSIC_WASM_I16X8_SUBTRACT, wasm_i16x8_sub, 0x91)
+INTERP_WASM_SIMD_INTRINSIC_V_VV (INTERP_SIMD_INTRINSIC_WASM_I32X4_SUBTRACT, wasm_i32x4_sub, 0xb1)
+INTERP_WASM_SIMD_INTRINSIC_V_VV (INTERP_SIMD_INTRINSIC_WASM_I64X2_SUBTRACT, wasm_i64x2_sub, 0xd1)
+// There is no i8x16 mul opcode
+INTERP_WASM_SIMD_INTRINSIC_V_VV (INTERP_SIMD_INTRINSIC_WASM_I8X16_MULTIPLY, _interp_wasm_simd_assert_not_reached, 0x0)
+INTERP_WASM_SIMD_INTRINSIC_V_VV (INTERP_SIMD_INTRINSIC_WASM_I16X8_MULTIPLY, wasm_i16x8_mul, 0x95)
+INTERP_WASM_SIMD_INTRINSIC_V_VV (INTERP_SIMD_INTRINSIC_WASM_I32X4_MULTIPLY, wasm_i32x4_mul, 0xb5)
+INTERP_WASM_SIMD_INTRINSIC_V_VV (INTERP_SIMD_INTRINSIC_WASM_I64X2_MULTIPLY, wasm_i64x2_mul, 0xd5)
+INTERP_WASM_SIMD_INTRINSIC_V_VV (INTERP_SIMD_INTRINSIC_WASM_I32X4_DOT_I16X8, wasm_i32x4_dot_i16x8, 0xba)
+INTERP_WASM_SIMD_INTRINSIC_V_V  (INTERP_SIMD_INTRINSIC_WASM_I8X16_NEGATE, wasm_i8x16_neg, 0x61)
+INTERP_WASM_SIMD_INTRINSIC_V_V  (INTERP_SIMD_INTRINSIC_WASM_I16X8_NEGATE, wasm_i16x8_neg, 0x81)
+INTERP_WASM_SIMD_INTRINSIC_V_V  (INTERP_SIMD_INTRINSIC_WASM_I32X4_NEGATE, wasm_i32x4_neg, 0xa1)
+INTERP_WASM_SIMD_INTRINSIC_V_V  (INTERP_SIMD_INTRINSIC_WASM_I64X2_NEGATE, wasm_i64x2_neg, 0xc1)
+INTERP_WASM_SIMD_INTRINSIC_V_VI (INTERP_SIMD_INTRINSIC_WASM_I8X16_SHIFTLEFT, wasm_i8x16_shl, 0x6b)
+INTERP_WASM_SIMD_INTRINSIC_V_VI (INTERP_SIMD_INTRINSIC_WASM_I16X8_SHIFTLEFT, wasm_i16x8_shl, 0x8b)
+INTERP_WASM_SIMD_INTRINSIC_V_VI (INTERP_SIMD_INTRINSIC_WASM_I32X4_SHIFTLEFT, wasm_i32x4_shl, 0xab)
+INTERP_WASM_SIMD_INTRINSIC_V_VI (INTERP_SIMD_INTRINSIC_WASM_I64X2_SHIFTLEFT, wasm_i64x2_shl, 0xcb)
+INTERP_WASM_SIMD_INTRINSIC_V_VI (INTERP_SIMD_INTRINSIC_WASM_I8X16_SHIFTRIGHTARITHMETIC, wasm_i8x16_shr, 0x6c)
+INTERP_WASM_SIMD_INTRINSIC_V_VI (INTERP_SIMD_INTRINSIC_WASM_I16X8_SHIFTRIGHTARITHMETIC, wasm_i16x8_shr, 0x8c)
+INTERP_WASM_SIMD_INTRINSIC_V_VI (INTERP_SIMD_INTRINSIC_WASM_I32X4_SHIFTRIGHTARITHMETIC, wasm_i32x4_shr, 0xac)
+INTERP_WASM_SIMD_INTRINSIC_V_VI (INTERP_SIMD_INTRINSIC_WASM_I64X2_SHIFTRIGHTARITHMETIC, wasm_i64x2_shr, 0xcc)
+INTERP_WASM_SIMD_INTRINSIC_V_VI (INTERP_SIMD_INTRINSIC_WASM_I8X16_SHIFTRIGHTLOGICAL, wasm_u8x16_shr, 0x6d)
+INTERP_WASM_SIMD_INTRINSIC_V_VI (INTERP_SIMD_INTRINSIC_WASM_I16X8_SHIFTRIGHTLOGICAL, wasm_u16x8_shr, 0x8d)
+INTERP_WASM_SIMD_INTRINSIC_V_VI (INTERP_SIMD_INTRINSIC_WASM_I32X4_SHIFTRIGHTLOGICAL, wasm_u32x4_shr, 0xad)
+INTERP_WASM_SIMD_INTRINSIC_V_VI (INTERP_SIMD_INTRINSIC_WASM_I64X2_SHIFTRIGHTLOGICAL, wasm_u64x2_shr, 0xcd)
+INTERP_WASM_SIMD_INTRINSIC_V_VV (INTERP_SIMD_INTRINSIC_WASM_AND, wasm_v128_and, 0x4e)
+// FIXME: NOT, OR, XOR
+INTERP_WASM_SIMD_INTRINSIC_I_V  (INTERP_SIMD_INTRINSIC_WASM_I8X16_BITMASK, wasm_i8x16_bitmask, 0x64)
+INTERP_WASM_SIMD_INTRINSIC_I_V  (INTERP_SIMD_INTRINSIC_WASM_I16X8_BITMASK, wasm_i16x8_bitmask, 0x84)
+INTERP_WASM_SIMD_INTRINSIC_I_V  (INTERP_SIMD_INTRINSIC_WASM_I32X4_BITMASK, wasm_i32x4_bitmask, 0xa4)
+INTERP_WASM_SIMD_INTRINSIC_I_V  (INTERP_SIMD_INTRINSIC_WASM_I64X2_BITMASK, wasm_i64x2_bitmask, 0xc4)
+INTERP_WASM_SIMD_INTRINSIC_V_VV (INTERP_SIMD_INTRINSIC_WASM_I8X16_COMPAREEQUAL, wasm_i8x16_eq, 0x23)
+INTERP_WASM_SIMD_INTRINSIC_V_VV (INTERP_SIMD_INTRINSIC_WASM_I16X8_COMPAREEQUAL, wasm_i16x8_eq, 0x2d)
+INTERP_WASM_SIMD_INTRINSIC_V_VV (INTERP_SIMD_INTRINSIC_WASM_I32X4_COMPAREEQUAL, wasm_i32x4_eq, 0x37)
+INTERP_WASM_SIMD_INTRINSIC_V_VV (INTERP_SIMD_INTRINSIC_WASM_I64X2_COMPAREEQUAL, wasm_i64x2_eq, 0xd6)
+INTERP_WASM_SIMD_INTRINSIC_V_VV (INTERP_SIMD_INTRINSIC_WASM_F32X4_COMPAREEQUAL, wasm_f32x4_eq, 0x41)
+INTERP_WASM_SIMD_INTRINSIC_V_VV (INTERP_SIMD_INTRINSIC_WASM_F64X2_COMPAREEQUAL, wasm_f64x2_eq, 0x47)
+INTERP_WASM_SIMD_INTRINSIC_V_VV (INTERP_SIMD_INTRINSIC_WASM_I8X16_COMPARENOTEQUAL, wasm_i8x16_ne, 0x24)
+INTERP_WASM_SIMD_INTRINSIC_V_VV (INTERP_SIMD_INTRINSIC_WASM_I16X8_COMPARENOTEQUAL, wasm_i16x8_ne, 0x2e)
+INTERP_WASM_SIMD_INTRINSIC_V_VV (INTERP_SIMD_INTRINSIC_WASM_I32X4_COMPARENOTEQUAL, wasm_i32x4_ne, 0x38)
+INTERP_WASM_SIMD_INTRINSIC_V_VV (INTERP_SIMD_INTRINSIC_WASM_I64X2_COMPARENOTEQUAL, wasm_i64x2_ne, 0xd7)
+INTERP_WASM_SIMD_INTRINSIC_V_VV (INTERP_SIMD_INTRINSIC_WASM_F32X4_COMPARENOTEQUAL, wasm_f32x4_ne, 0x42)
+INTERP_WASM_SIMD_INTRINSIC_V_VV (INTERP_SIMD_INTRINSIC_WASM_F64X2_COMPARENOTEQUAL, wasm_f64x2_ne, 0x48)
+INTERP_WASM_SIMD_INTRINSIC_V_VV (INTERP_SIMD_INTRINSIC_WASM_I8X16_NARROW_I16X8_S, wasm_i8x16_narrow_i16x8, 0x65)
+INTERP_WASM_SIMD_INTRINSIC_V_VV (INTERP_SIMD_INTRINSIC_WASM_I16X8_NARROW_I32X4_S, wasm_i16x8_narrow_i32x4, 0x85)
+INTERP_WASM_SIMD_INTRINSIC_V_VV (INTERP_SIMD_INTRINSIC_WASM_I8X16_NARROW_I16X8_U, wasm_u8x16_narrow_i16x8, 0x66)
+INTERP_WASM_SIMD_INTRINSIC_V_VV (INTERP_SIMD_INTRINSIC_WASM_I16X8_NARROW_I32X4_U, wasm_u16x8_narrow_i32x4, 0x86)
index f67370e..09e90a9 100644 (file)
@@ -2,6 +2,10 @@
 #include "interp-internals.h"
 #include "interp-simd.h"
 
+#if HOST_BROWSER
+#include <wasm_simd128.h>
+#endif
+
 #ifdef INTERP_ENABLE_SIMD
 
 typedef gint64 v128_i8 __attribute__ ((vector_size (SIZEOF_V128)));
@@ -12,6 +16,7 @@ typedef gint16 v128_i2 __attribute__ ((vector_size (SIZEOF_V128)));
 typedef guint16 v128_u2 __attribute__ ((vector_size (SIZEOF_V128)));
 typedef gint8 v128_i1 __attribute__ ((vector_size (SIZEOF_V128)));
 typedef guint8 v128_u1 __attribute__ ((vector_size (SIZEOF_V128)));
+typedef float v128_r4 __attribute__ ((vector_size (SIZEOF_V128)));
 
 // get_AllBitsSet
 static void
@@ -39,6 +44,12 @@ interp_v128_i4_op_addition (gpointer res, gpointer v1, gpointer v2)
        *(v128_i4*)res = *(v128_i4*)v1 + *(v128_i4*)v2;
 }
 
+static void
+interp_v128_r4_op_addition (gpointer res, gpointer v1, gpointer v2) // float counterpart of the integer op_Addition helpers
+{
+       *(v128_r4*)res = *(v128_r4*)v1 + *(v128_r4*)v2; // element-wise add on the float vector type (vector_size attribute); result is written through res
+}
+
 // op_Subtraction
 static void
 interp_v128_i1_op_subtraction (gpointer res, gpointer v1, gpointer v2)
@@ -58,6 +69,12 @@ interp_v128_i4_op_subtraction (gpointer res, gpointer v1, gpointer v2)
        *(v128_i4*)res = *(v128_i4*)v1 - *(v128_i4*)v2;
 }
 
+static void
+interp_v128_r4_op_subtraction (gpointer res, gpointer v1, gpointer v2) // float counterpart of the integer op_Subtraction helpers
+{
+       *(v128_r4*)res = *(v128_r4*)v1 - *(v128_r4*)v2; // element-wise v1 - v2 on the float vector type; result is written through res
+}
+
 // op_BitwiseAnd
 static void
 interp_v128_op_bitwise_and (gpointer res, gpointer v1, gpointer v2)
@@ -124,6 +141,18 @@ interp_v128_i4_op_multiply (gpointer res, gpointer v1, gpointer v2)
        *(v128_i4*)res = *(v128_i4*)v1 * *(v128_i4*)v2;
 }
 
+static void
+interp_v128_r4_op_multiply (gpointer res, gpointer v1, gpointer v2) // float counterpart of the integer multiply helpers
+{
+       *(v128_r4*)res = *(v128_r4*)v1 * *(v128_r4*)v2; // element-wise product on the float vector type; result is written through res
+}
+
+static void
+interp_v128_r4_op_division (gpointer res, gpointer v1, gpointer v2) // only a float (r4) division helper is added by this change
+{
+       *(v128_r4*)res = *(v128_r4*)v1 / *(v128_r4*)v2; // element-wise v1 / v2 on the float vector type; result is written through res
+}
+
 // op_UnaryNegation
 static void
 interp_v128_i1_op_negation (gpointer res, gpointer v1)
@@ -535,32 +564,122 @@ interp_v128_i8_shuffle (gpointer res, gpointer v1, gpointer v2)
        V128_SHUFFLE (gint64, guint64);
 }
 
-#define INTERP_SIMD_INTRINSIC_P_P(a,b)
-#define INTERP_SIMD_INTRINSIC_P_PP(a,b)
-#define INTERP_SIMD_INTRINSIC_P_PPP(a,b)
+#define INTERP_SIMD_INTRINSIC_P_P(a,b,c)
+#define INTERP_SIMD_INTRINSIC_P_PP(a,b,c)
+#define INTERP_SIMD_INTRINSIC_P_PPP(a,b,c)
+
+// For the wasm packed simd intrinsics we want to automatically generate the C implementations from
+//  their corresponding clang intrinsics. See also:
+// https://github.com/llvm/llvm-project/blob/main/clang/lib/Headers/wasm_simd128.h
+// In this context V means Vector128 and P means void* pointer.
+#ifdef HOST_BROWSER
+
+static v128_t
+_interp_wasm_simd_assert_not_reached (v128_t lhs, v128_t rhs) { // stand-in c_intrinsic for .def entries that have no real intrinsic (the i8x16 multiply entry uses it)
+       g_assert_not_reached (); // asserts if ever called; lhs/rhs are ignored and no value is returned
+}
+
+#define INTERP_WASM_SIMD_INTRINSIC_V_P(id, c_intrinsic, wasm_opcode) \
+static void \
+_mono_interp_simd_ ## id (gpointer res, gpointer v1) { \
+       *((v128_t *)res) = c_intrinsic (v1); \
+} // V_P: v1 is passed to the intrinsic as a pointer; the v128 result is stored through res (wasm_opcode is unused in these wrappers)
+
+#define INTERP_WASM_SIMD_INTRINSIC_V_V(id, c_intrinsic, wasm_opcode) \
+static void \
+_mono_interp_simd_ ## id (gpointer res, gpointer v1) { \
+       *((v128_t *)res) = c_intrinsic (*((v128_t *)v1)); \
+} // V_V: v1 is dereferenced as a v128 operand; the v128 result is stored through res
+
+#define INTERP_WASM_SIMD_INTRINSIC_I_V(id, c_intrinsic, wasm_opcode) \
+static void \
+_mono_interp_simd_ ## id (gpointer res, gpointer v1) { \
+       *((int32_t *)res) = c_intrinsic (*((v128_t *)v1)); \
+} // I_V: v128 operand in, 32-bit integer result stored through res
+
+#define INTERP_WASM_SIMD_INTRINSIC_V_VV(id, c_intrinsic, wasm_opcode) \
+static void \
+_mono_interp_simd_ ## id (gpointer res, gpointer v1, gpointer v2) { \
+       *((v128_t *)res) = c_intrinsic (*((v128_t *)v1), *((v128_t *)v2)); \
+} // V_VV: two v128 operands in, v128 result stored through res
+
+#define INTERP_WASM_SIMD_INTRINSIC_V_VI(id, c_intrinsic, wasm_opcode) \
+static void \
+_mono_interp_simd_ ## id (gpointer res, gpointer v1, gpointer v2) { \
+       *((v128_t *)res) = c_intrinsic (*((v128_t *)v1), *((int *)v2)); \
+} // V_VI: v128 operand plus an int read through v2 (used by the shift entries); v128 result stored through res
+
+#define INTERP_WASM_SIMD_INTRINSIC_V_VVV(id, c_intrinsic, wasm_opcode) \
+static void \
+_mono_interp_simd_ ## id (gpointer res, gpointer v1, gpointer v2, gpointer v3) { \
+       *((v128_t *)res) = c_intrinsic (*((v128_t *)v1), *((v128_t *)v2), *((v128_t *)v3)); \
+} // V_VVV: three v128 operands in, v128 result stored through res
+
+#include "interp-simd-intrins.def" // the generic P_* macros are empty at this point, so this pass emits only the _mono_interp_simd_<id> wrapper functions
+
+#undef INTERP_WASM_SIMD_INTRINSIC_V_P // drop the wrapper generators so later includes fall back to the .def file's default mappings
+#undef INTERP_WASM_SIMD_INTRINSIC_V_V
+#undef INTERP_WASM_SIMD_INTRINSIC_I_V
+#undef INTERP_WASM_SIMD_INTRINSIC_V_VV
+#undef INTERP_WASM_SIMD_INTRINSIC_V_VI
+#undef INTERP_WASM_SIMD_INTRINSIC_V_VVV
+
+// Now generate the wasm opcode tables for the intrinsics
+
+#undef INTERP_SIMD_INTRINSIC_P_P
+#define INTERP_SIMD_INTRINSIC_P_P(a,b,c) c, // keep only the opcode column
+
+int interp_simd_p_p_wasm_opcode_table [] = { // one opcode per one-operand intrinsic, in .def order
+#include "interp-simd-intrins.def"
+};
+
+#undef INTERP_SIMD_INTRINSIC_P_P
+#define INTERP_SIMD_INTRINSIC_P_P(a,b,c) // back to empty so the next table only sees its own arity
+
+#undef INTERP_SIMD_INTRINSIC_P_PP
+#define INTERP_SIMD_INTRINSIC_P_PP(a,b,c) c,
+
+int interp_simd_p_pp_wasm_opcode_table [] = { // one opcode per two-operand intrinsic, in .def order
+#include "interp-simd-intrins.def"
+};
+
+#undef INTERP_SIMD_INTRINSIC_P_PP
+#define INTERP_SIMD_INTRINSIC_P_PP(a,b,c)
+
+#undef INTERP_SIMD_INTRINSIC_P_PPP
+#define INTERP_SIMD_INTRINSIC_P_PPP(a,b,c) c,
+
+int interp_simd_p_ppp_wasm_opcode_table [] = { // one opcode per three-operand intrinsic, in .def order
+#include "interp-simd-intrins.def"
+};
+
+#undef INTERP_SIMD_INTRINSIC_P_PPP
+#define INTERP_SIMD_INTRINSIC_P_PPP(a,b,c)
+
+#endif // HOST_BROWSER
 
 #undef INTERP_SIMD_INTRINSIC_P_P
-#define INTERP_SIMD_INTRINSIC_P_P(a,b) b,
+#define INTERP_SIMD_INTRINSIC_P_P(a,b,c) b,
 PP_SIMD_Method interp_simd_p_p_table [] = {
 #include "interp-simd-intrins.def"
 };
 #undef INTERP_SIMD_INTRINSIC_P_P
-#define INTERP_SIMD_INTRINSIC_P_P(a,b)
+#define INTERP_SIMD_INTRINSIC_P_P(a,b,c)
 
 #undef INTERP_SIMD_INTRINSIC_P_PP
-#define INTERP_SIMD_INTRINSIC_P_PP(a,b) b,
+#define INTERP_SIMD_INTRINSIC_P_PP(a,b,c) b,
 PPP_SIMD_Method interp_simd_p_pp_table [] = {
 #include "interp-simd-intrins.def"
 };
 #undef INTERP_SIMD_INTRINSIC_P_PP
-#define INTERP_SIMD_INTRINSIC_P_PP(a,b)
+#define INTERP_SIMD_INTRINSIC_P_PP(a,b,c)
 
 #undef INTERP_SIMD_INTRINSIC_P_PPP
-#define INTERP_SIMD_INTRINSIC_P_PPP(a,b) b,
+#define INTERP_SIMD_INTRINSIC_P_PPP(a,b,c) b,
 PPPP_SIMD_Method interp_simd_p_ppp_table [] = {
 #include "interp-simd-intrins.def"
 };
 #undef INTERP_SIMD_INTRINSIC_P_PPP
-#define INTERP_SIMD_INTRINSIC_P_PPP(a,b)
+#define INTERP_SIMD_INTRINSIC_P_PPP(a,b,c)
 
 #endif // INTERP_ENABLE_SIMD
index 3763c57..e3306a2 100644 (file)
@@ -11,6 +11,12 @@ extern PP_SIMD_Method interp_simd_p_p_table [];
 extern PPP_SIMD_Method interp_simd_p_pp_table [];
 extern PPPP_SIMD_Method interp_simd_p_ppp_table [];
 
+#if HOST_BROWSER // the opcode tables are only defined in interp-simd.c for browser builds
+extern int interp_simd_p_p_wasm_opcode_table [];
+extern int interp_simd_p_pp_wasm_opcode_table [];
+extern int interp_simd_p_ppp_wasm_opcode_table [];
+#endif
+
 #endif /* __MONO_MINI_INTERP_SIMD_H__ */
 
 
index f3802f7..9e1e9e1 100644 (file)
@@ -8907,4 +8907,42 @@ mono_jiterp_enum_hasflag (MonoClass *klass, gint32 *dest, stackval *sp1, stackva
        *dest = mono_interp_enum_hasflag (sp1, sp2, klass);
 }
 
+EMSCRIPTEN_KEEPALIVE gpointer
+mono_jiterp_get_simd_intrinsic (int arity, int index) // returns the C implementation stored at [index] of the table chosen by arity
+{
+#ifdef INTERP_ENABLE_SIMD
+       switch (arity) { // arity = number of operand pointers besides the result pointer
+               case 1:
+                       return interp_simd_p_p_table [index]; // index is used as-is; no range check is done here
+               case 2:
+                       return interp_simd_p_pp_table [index];
+               case 3:
+                       return interp_simd_p_ppp_table [index];
+               default:
+                       g_assert_not_reached(); // any other arity is treated as a caller bug
+       }
+#else // the tables are not referenced without INTERP_ENABLE_SIMD, so any call asserts
+       g_assert_not_reached();
+#endif
+}
+
+EMSCRIPTEN_KEEPALIVE int
+mono_jiterp_get_simd_opcode (int arity, int index) // returns the wasm opcode recorded for the intrinsic at [index]; the .def file uses 0 for "no exact 1:1 mapping"
+{
+#ifdef INTERP_ENABLE_SIMD
+       switch (arity) { // same (arity, index) addressing as mono_jiterp_get_simd_intrinsic
+               case 1:
+                       return interp_simd_p_p_wasm_opcode_table [index]; // index is used as-is; no range check is done here
+               case 2:
+                       return interp_simd_p_pp_wasm_opcode_table [index];
+               case 3:
+                       return interp_simd_p_ppp_wasm_opcode_table [index];
+               default:
+                       g_assert_not_reached(); // any other arity is treated as a caller bug
+       }
+#else // the tables are not referenced without INTERP_ENABLE_SIMD, so any call asserts
+       g_assert_not_reached();
+#endif
+}
+
 #endif
index 021a439..2849cec 100644 (file)
@@ -41,35 +41,35 @@ typedef enum {
 
 /* SIMD opcodes, grouped by signature */
 
-#define INTERP_SIMD_INTRINSIC_P_P(a,b)
-#define INTERP_SIMD_INTRINSIC_P_PP(a,b)
-#define INTERP_SIMD_INTRINSIC_P_PPP(a,b)
+#define INTERP_SIMD_INTRINSIC_P_P(a,b,c)
+#define INTERP_SIMD_INTRINSIC_P_PP(a,b,c)
+#define INTERP_SIMD_INTRINSIC_P_PPP(a,b,c)
 
 #undef INTERP_SIMD_INTRINSIC_P_P
-#define INTERP_SIMD_INTRINSIC_P_P(a,b) a,
+#define INTERP_SIMD_INTRINSIC_P_P(a,b,c) a,
 typedef enum {
 #include "interp-simd-intrins.def"
 } MintSIMDOpsPP;
 #undef INTERP_SIMD_INTRINSIC_P_P
-#define INTERP_SIMD_INTRINSIC_P_P(a,b)
+#define INTERP_SIMD_INTRINSIC_P_P(a,b,c)
 
 #undef INTERP_SIMD_INTRINSIC_P_PP
-#define INTERP_SIMD_INTRINSIC_P_PP(a,b) a,
+#define INTERP_SIMD_INTRINSIC_P_PP(a,b,c) a,
 typedef enum {
 #include "interp-simd-intrins.def"
        INTERP_SIMD_INTRINSIC_P_PP_LAST
 } MintSIMDOpsPPP;
 #undef INTERP_SIMD_INTRINSIC_P_PP
-#define INTERP_SIMD_INTRINSIC_P_PP(a,b)
+#define INTERP_SIMD_INTRINSIC_P_PP(a,b,c)
 
 #undef INTERP_SIMD_INTRINSIC_P_PPP
-#define INTERP_SIMD_INTRINSIC_P_PPP(a,b) a,
+#define INTERP_SIMD_INTRINSIC_P_PPP(a,b,c) a,
 typedef enum {
 #include "interp-simd-intrins.def"
        INTERP_SIMD_INTRINSIC_P_PPP_LAST
 } MintSIMDOpsPPPP;
 #undef INTERP_SIMD_INTRINSIC_P_PPP
-#define INTERP_SIMD_INTRINSIC_P_PPP(a,b)
+#define INTERP_SIMD_INTRINSIC_P_PPP(a,b,c)
 
 #if NO_UNALIGNED_ACCESS
 #  if G_BYTE_ORDER == G_LITTLE_ENDIAN
index 57b87d0..4eb76e1 100644 (file)
@@ -1,12 +1,14 @@
 SIMD_METHOD(get_Count)
 SIMD_METHOD(get_AllBitsSet)
 SIMD_METHOD(get_IsHardwareAccelerated)
+SIMD_METHOD(get_IsSupported)
 SIMD_METHOD(get_Item)
 SIMD_METHOD(get_One)
 SIMD_METHOD(get_Zero)
 SIMD_METHOD(op_Addition)
 SIMD_METHOD(op_BitwiseAnd)
 SIMD_METHOD(op_BitwiseOr)
+SIMD_METHOD(op_Division)
 SIMD_METHOD(op_Equality)
 SIMD_METHOD(op_ExclusiveOr)
 SIMD_METHOD(op_Explicit)
@@ -24,6 +26,7 @@ SIMD_METHOD(ConditionalSelect)
 SIMD_METHOD(Create)
 SIMD_METHOD(CreateScalar)
 SIMD_METHOD(CreateScalarUnsafe)
+
 SIMD_METHOD(Equals)
 SIMD_METHOD(ExtractMostSignificantBits)
 SIMD_METHOD(GreaterThan)
@@ -36,3 +39,20 @@ SIMD_METHOD(ShiftRightLogical)
 SIMD_METHOD(Shuffle)
 SIMD_METHOD(WidenLower)
 SIMD_METHOD(WidenUpper)
+
+// PackedSimd
+SIMD_METHOD(Splat)
+SIMD_METHOD(ExtractLane)
+SIMD_METHOD(ReplaceLane)
+SIMD_METHOD(Swizzle)
+SIMD_METHOD(Add)
+SIMD_METHOD(Subtract)
+SIMD_METHOD(Multiply)
+SIMD_METHOD(Dot)
+SIMD_METHOD(Negate)
+SIMD_METHOD(And)
+SIMD_METHOD(Bitmask)
+SIMD_METHOD(CompareEqual)
+SIMD_METHOD(CompareNotEqual)
+SIMD_METHOD(ConvertNarrowingSignedSaturate)
+SIMD_METHOD(ConvertNarrowingUnsignedSaturate)
index a46f755..bb7c269 100644 (file)
@@ -2,6 +2,8 @@
  * SIMD Intrinsics support for interpreter
  */
 
+#include "config.h"
+#include <glib.h>
 #include <mono/utils/bsearch.h>
 
 // We use the same approach as jit/aot for identifying simd methods.
@@ -61,7 +63,7 @@ static guint16 sri_vector128_methods [] = {
        SN_Shuffle,
        SN_WidenLower,
        SN_WidenUpper,
-       SN_get_IsHardwareAccelerated
+       SN_get_IsHardwareAccelerated,
 };
 
 static guint16 sri_vector128_t_methods [] = {
@@ -72,6 +74,7 @@ static guint16 sri_vector128_t_methods [] = {
        SN_op_Addition,
        SN_op_BitwiseAnd,
        SN_op_BitwiseOr,
+       SN_op_Division,
        SN_op_Equality,
        SN_op_ExclusiveOr,
        SN_op_Inequality,
@@ -84,6 +87,60 @@ static guint16 sri_vector128_t_methods [] = {
        SN_op_UnsignedRightShift
 };
 
+/*
+ * PackedSimd methods that emit_sri_packedsimd is allowed to handle. That function gives up
+ * as soon as lookup_intrins fails to find the method in this table, so every SN_ id that
+ * has a case in its switch must also be listed here or the case can never run.
+ * NOTE(review): entries are kept in the same order as the other sri_*_methods tables in
+ * this file (by method name, uppercase before lowercase), which is presumably what the
+ * search inside lookup_intrins relies on -- confirm before reordering.
+ */
+static guint16 sri_packedsimd_methods [] = {
+       SN_Add,
+       SN_And,
+       SN_Bitmask,
+       SN_CompareEqual,
+       SN_CompareNotEqual,
+       SN_ConvertNarrowingSignedSaturate,
+       SN_ConvertNarrowingUnsignedSaturate,
+       SN_Dot,
+       SN_Multiply,
+       SN_Negate,
+       SN_ShiftLeft,
+       SN_ShiftRightArithmetic,
+       SN_ShiftRightLogical,
+       SN_Splat,
+       SN_Subtract,
+       SN_Swizzle,
+       SN_get_IsHardwareAccelerated,
+       SN_get_IsSupported,
+};
+
+#if HOST_BROWSER
+
+/*
+ * maps from INTERP_SIMD_INTRINSIC_WASM_I8X16_xxx to the correct one for the return type,
+ * assuming that they are laid out sequentially like this:
+ * INTERP_WASM_SIMD_INTRINSIC_V_VV (INTERP_SIMD_INTRINSIC_WASM_I8X16_COMPAREEQUAL, wasm_i8x16_eq, 0x0)
+ * INTERP_WASM_SIMD_INTRINSIC_V_VV (INTERP_SIMD_INTRINSIC_WASM_I16X8_COMPAREEQUAL, wasm_i16x8_eq, 0x0)
+ * INTERP_WASM_SIMD_INTRINSIC_V_VV (INTERP_SIMD_INTRINSIC_WASM_I32X4_COMPAREEQUAL, wasm_i32x4_eq, 0x0)
+ * INTERP_WASM_SIMD_INTRINSIC_V_VV (INTERP_SIMD_INTRINSIC_WASM_I64X2_COMPAREEQUAL, wasm_i64x2_eq, 0x0)
+ * INTERP_WASM_SIMD_INTRINSIC_V_VV (INTERP_SIMD_INTRINSIC_WASM_F32X4_COMPAREEQUAL, wasm_f32x4_eq, 0x0)
+ * INTERP_WASM_SIMD_INTRINSIC_V_VV (INTERP_SIMD_INTRINSIC_WASM_F64X2_COMPAREEQUAL, wasm_f64x2_eq, 0x0)
+ * It is your responsibility to ensure that it's actually laid out this way!
+ */
+
+// Indexed directly by MonoTypeEnum value, so there must be exactly one entry per value,
+// including values that have no type assigned. -1 marks element types with no lane layout.
+static int sri_packedsimd_offset_from_atype [] = {
+       -1, // MONO_TYPE_END        = 0x00,
+       -1, // MONO_TYPE_VOID       = 0x01,
+       // FIXME: Should this be 2, for I4?
+       0, // MONO_TYPE_BOOLEAN    = 0x02,
+       1, // MONO_TYPE_CHAR       = 0x03,
+       0, // MONO_TYPE_I1         = 0x04,
+       0, // MONO_TYPE_U1         = 0x05,
+       1, // MONO_TYPE_I2         = 0x06,
+       1, // MONO_TYPE_U2         = 0x07,
+       2, // MONO_TYPE_I4         = 0x08,
+       2, // MONO_TYPE_U4         = 0x09,
+       3, // MONO_TYPE_I8         = 0x0a,
+       3, // MONO_TYPE_U8         = 0x0b,
+       4, // MONO_TYPE_R4         = 0x0c,
+       5, // MONO_TYPE_R8         = 0x0d,
+       -1, // MONO_TYPE_STRING     = 0x0e,
+       2, // MONO_TYPE_PTR        = 0x0f,
+       -1, // MONO_TYPE_BYREF      = 0x10,
+       -1, // MONO_TYPE_VALUETYPE  = 0x11,
+       -1, // MONO_TYPE_CLASS      = 0x12,
+       -1, // MONO_TYPE_VAR         = 0x13,
+       -1, // MONO_TYPE_ARRAY      = 0x14,
+       -1, // MONO_TYPE_GENERICINST= 0x15,
+       -1, // MONO_TYPE_TYPEDBYREF = 0x16,
+       -1, // (no type assigned)   = 0x17, placeholder so the next two entries land on their own values
+       2, // MONO_TYPE_I          = 0x18,
+       2, // MONO_TYPE_U          = 0x19,
+};
+
+static const int sri_packedsimd_offset_from_atype_length = sizeof(sri_packedsimd_offset_from_atype) / sizeof(sri_packedsimd_offset_from_atype[0]);
+#endif // HOST_BROWSER
+
 static gboolean
 emit_sri_vector128 (TransformData *td, MonoMethod *cmethod, MonoMethodSignature *csignature)
 {
@@ -373,26 +430,36 @@ emit_sri_vector128_t (TransformData *td, MonoMethod *cmethod, MonoMethodSignatur
                        }
                        break;
                case SN_op_LeftShift:
-                       g_assert (scalar_arg == 1);
+                       if (scalar_arg != 1)
+                               return FALSE;
                        simd_opcode = MINT_SIMD_INTRINS_P_PP;
                        if (arg_size == 1) simd_intrins = INTERP_SIMD_INTRINSIC_V128_I1_LEFT_SHIFT;
                        else if (arg_size == 2) simd_intrins = INTERP_SIMD_INTRINSIC_V128_I2_LEFT_SHIFT;
                        else if (arg_size == 4) simd_intrins = INTERP_SIMD_INTRINSIC_V128_I4_LEFT_SHIFT;
                        else if (arg_size == 8) simd_intrins = INTERP_SIMD_INTRINSIC_V128_I8_LEFT_SHIFT;
                        break;
+               case SN_op_Division:
+                       if (scalar_arg != -1)
+                               return FALSE;
+                       simd_opcode = MINT_SIMD_INTRINS_P_PP;
+                       if (atype == MONO_TYPE_R4) simd_intrins = INTERP_SIMD_INTRINSIC_V128_R4_DIVISION;
+                       break;
                case SN_op_Multiply:
-                       g_assert (scalar_arg == -1);
+                       if (scalar_arg != -1)
+                               return FALSE;
                        simd_opcode = MINT_SIMD_INTRINS_P_PP;
                        if (atype == MONO_TYPE_I1 || atype == MONO_TYPE_U1) simd_intrins = INTERP_SIMD_INTRINSIC_V128_I1_MULTIPLY;
                        else if (atype == MONO_TYPE_I2 || atype == MONO_TYPE_U2) simd_intrins = INTERP_SIMD_INTRINSIC_V128_I2_MULTIPLY;
                        else if (atype == MONO_TYPE_I4 || atype == MONO_TYPE_U4) simd_intrins = INTERP_SIMD_INTRINSIC_V128_I4_MULTIPLY;
+                       else if (atype == MONO_TYPE_R4) simd_intrins = INTERP_SIMD_INTRINSIC_V128_R4_MULTIPLY;
                        break;
                case SN_op_OnesComplement:
                        simd_opcode = MINT_SIMD_INTRINS_P_P;
                        simd_intrins = INTERP_SIMD_INTRINSIC_V128_ONES_COMPLEMENT;
                        break;
                case SN_op_RightShift:
-                       g_assert (scalar_arg == 1);
+                       if (scalar_arg != 1)
+                               return FALSE;
                        simd_opcode = MINT_SIMD_INTRINS_P_PP;
                        if (atype == MONO_TYPE_I1) simd_intrins = INTERP_SIMD_INTRINSIC_V128_I1_RIGHT_SHIFT;
                        else if (atype == MONO_TYPE_I2) simd_intrins = INTERP_SIMD_INTRINSIC_V128_I2_RIGHT_SHIFT;
@@ -414,7 +481,8 @@ emit_sri_vector128_t (TransformData *td, MonoMethod *cmethod, MonoMethodSignatur
                        else if (atype == MONO_TYPE_I4 || atype == MONO_TYPE_U4) simd_intrins = INTERP_SIMD_INTRINSIC_V128_I4_NEGATION;
                        break;
                case SN_op_UnsignedRightShift:
-                       g_assert (scalar_arg == 1);
+                       if (scalar_arg != 1)
+                               return FALSE;
                        simd_opcode = MINT_SIMD_INTRINS_P_PP;
                        if (arg_size == 1) simd_intrins = INTERP_SIMD_INTRINSIC_V128_I1_URIGHT_SHIFT;
                        else if (arg_size == 2) simd_intrins = INTERP_SIMD_INTRINSIC_V128_I2_URIGHT_SHIFT;
@@ -448,6 +516,182 @@ opcode_added:
        return TRUE;
 }
 
+#if HOST_BROWSER
+/*
+ * Picks the element-type-specific variant of a PackedSimd intrinsic family.
+ * base_intrins is the first id of the family; the per-type offset from
+ * sri_packedsimd_offset_from_atype is added to it. Offsets 0..3 are always accepted,
+ * offsets 4 and 5 only when allow_float is set. Returns -1 when atype is outside the
+ * table, has no offset, or its offset is not permitted.
+ */
+static int
+map_packedsimd_intrins_based_on_atype (MonoTypeEnum atype, int base_intrins, gboolean allow_float)
+{
+       if ((atype < 0) || (atype >= sri_packedsimd_offset_from_atype_length))
+               return -1;
+
+       const int lane_offset = sri_packedsimd_offset_from_atype [atype];
+       const int highest_allowed = allow_float ? 5 : 3;
+       gboolean usable = (lane_offset >= 0) && (lane_offset <= highest_allowed);
+
+       return usable ? (base_intrins + lane_offset) : -1;
+}
+#endif
+
+/*
+ * emit_sri_packedsimd:
+ *
+ *   Tries to replace a call to a System.Runtime.Intrinsics.Wasm.PackedSimd method with a
+ * single interpreter instruction.
+ *
+ * Returns TRUE when an instruction was emitted; in that case the call's arguments have
+ * been popped from the transform stack, the result has been pushed, and td->ip has been
+ * advanced. Returns FALSE, without emitting anything, when the method is not in
+ * sri_packedsimd_methods, when no Vector128 can be found in the signature, or when the
+ * element type has no matching intrinsic.
+ */
+static gboolean
+emit_sri_packedsimd (TransformData *td, MonoMethod *cmethod, MonoMethodSignature *csignature)
+{
+       int id = lookup_intrins (sri_packedsimd_methods, sizeof (sri_packedsimd_methods), cmethod);
+       if (id == -1)
+               return FALSE;
+
+       MonoClass *vector_klass = mono_class_from_mono_type_internal (csignature->ret);
+       int vector_size = -1;
+
+       // The capability queries fold to a constant: 1 on browser hosts, 0 everywhere else
+       if ((id == SN_get_IsSupported) || (id == SN_get_IsHardwareAccelerated)) {
+#if HOST_BROWSER
+               interp_add_ins (td, MINT_LDC_I4_1);
+#else
+               interp_add_ins (td, MINT_LDC_I4_0);
+#endif
+               goto opcode_added;
+       }
+
+#if HOST_BROWSER
+       gint16 simd_opcode = -1;
+       gint16 simd_intrins = -1;
+       if (!m_class_is_simd_type (vector_klass)) {
+               // The return value is not a vector, so the element type has to come from the
+               // first argument. Make sure there is one before reading params [0].
+               if (csignature->param_count < 1)
+                       return FALSE;
+               vector_klass = mono_class_from_mono_type_internal (csignature->params [0]);
+       }
+       if (!m_class_is_simd_type (vector_klass))
+               return FALSE;
+
+       vector_size = mono_class_value_size (vector_klass, NULL);
+       g_assert (vector_size == SIZEOF_V128);
+
+       // atype is the T of the Vector128<T> found above
+       MonoType *arg_type = mono_class_get_context (vector_klass)->class_inst->type_argv [0];
+       if (!mono_type_is_primitive (arg_type))
+               return FALSE;
+       MonoTypeEnum atype = arg_type->type;
+       if (atype == MONO_TYPE_BOOLEAN)
+               return FALSE;
+
+       // Cases that call map_packedsimd_intrins_based_on_atype pick a per-element-type
+       // variant; the remaining cases only exist for fixed element types.
+       switch (id) {
+               case SN_Splat: {
+                       simd_opcode = MINT_SIMD_INTRINS_P_P;
+                       simd_intrins = map_packedsimd_intrins_based_on_atype (atype, INTERP_SIMD_INTRINSIC_WASM_I8X16_SPLAT, FALSE);
+                       break;
+               }
+               case SN_Swizzle: {
+                       simd_opcode = MINT_SIMD_INTRINS_P_PP;
+                       simd_intrins = INTERP_SIMD_INTRINSIC_WASM_I8X16_SWIZZLE;
+                       break;
+               }
+               case SN_Add: {
+                       simd_opcode = MINT_SIMD_INTRINS_P_PP;
+                       simd_intrins = map_packedsimd_intrins_based_on_atype (atype, INTERP_SIMD_INTRINSIC_WASM_I8X16_ADD, FALSE);
+                       break;
+               }
+               case SN_Subtract: {
+                       simd_opcode = MINT_SIMD_INTRINS_P_PP;
+                       simd_intrins = map_packedsimd_intrins_based_on_atype (atype, INTERP_SIMD_INTRINSIC_WASM_I8X16_SUBTRACT, FALSE);
+                       break;
+               }
+               case SN_Multiply: {
+                       simd_opcode = MINT_SIMD_INTRINS_P_PP;
+                       simd_intrins = map_packedsimd_intrins_based_on_atype (atype, INTERP_SIMD_INTRINSIC_WASM_I8X16_MULTIPLY, FALSE);
+                       break;
+               }
+               case SN_Dot: {
+                       simd_opcode = MINT_SIMD_INTRINS_P_PP;
+                       simd_intrins = INTERP_SIMD_INTRINSIC_WASM_I32X4_DOT_I16X8;
+                       break;
+               }
+               case SN_Negate: {
+                       simd_opcode = MINT_SIMD_INTRINS_P_P;
+                       simd_intrins = map_packedsimd_intrins_based_on_atype (atype, INTERP_SIMD_INTRINSIC_WASM_I8X16_NEGATE, FALSE);
+                       break;
+               }
+               case SN_ShiftLeft: {
+                       simd_opcode = MINT_SIMD_INTRINS_P_PP;
+                       simd_intrins = map_packedsimd_intrins_based_on_atype (atype, INTERP_SIMD_INTRINSIC_WASM_I8X16_SHIFTLEFT, FALSE);
+                       break;
+               }
+               case SN_ShiftRightArithmetic: {
+                       simd_opcode = MINT_SIMD_INTRINS_P_PP;
+                       simd_intrins = map_packedsimd_intrins_based_on_atype (atype, INTERP_SIMD_INTRINSIC_WASM_I8X16_SHIFTRIGHTARITHMETIC, FALSE);
+                       break;
+               }
+               case SN_ShiftRightLogical: {
+                       simd_opcode = MINT_SIMD_INTRINS_P_PP;
+                       simd_intrins = map_packedsimd_intrins_based_on_atype (atype, INTERP_SIMD_INTRINSIC_WASM_I8X16_SHIFTRIGHTLOGICAL, FALSE);
+                       break;
+               }
+               case SN_And: {
+                       simd_opcode = MINT_SIMD_INTRINS_P_PP;
+                       simd_intrins = INTERP_SIMD_INTRINSIC_WASM_AND;
+                       break;
+               }
+               case SN_Bitmask: {
+                       simd_opcode = MINT_SIMD_INTRINS_P_P;
+                       simd_intrins = map_packedsimd_intrins_based_on_atype (atype, INTERP_SIMD_INTRINSIC_WASM_I8X16_BITMASK, FALSE);
+                       break;
+               }
+               case SN_CompareEqual: {
+                       simd_opcode = MINT_SIMD_INTRINS_P_PP;
+                       simd_intrins = map_packedsimd_intrins_based_on_atype (atype, INTERP_SIMD_INTRINSIC_WASM_I8X16_COMPAREEQUAL, TRUE);
+                       break;
+               }
+               case SN_CompareNotEqual: {
+                       simd_opcode = MINT_SIMD_INTRINS_P_PP;
+                       simd_intrins = map_packedsimd_intrins_based_on_atype (atype, INTERP_SIMD_INTRINSIC_WASM_I8X16_COMPARENOTEQUAL, TRUE);
+                       break;
+               }
+               case SN_ConvertNarrowingSignedSaturate: {
+                       // atype is the element type of the narrowed result here;
+                       // any other type leaves simd_intrins at -1 and is rejected below
+                       simd_opcode = MINT_SIMD_INTRINS_P_PP;
+                       if (atype == MONO_TYPE_I1)
+                               simd_intrins = INTERP_SIMD_INTRINSIC_WASM_I8X16_NARROW_I16X8_S;
+                       else if (atype == MONO_TYPE_I2)
+                               simd_intrins = INTERP_SIMD_INTRINSIC_WASM_I16X8_NARROW_I32X4_S;
+                       break;
+               }
+               case SN_ConvertNarrowingUnsignedSaturate: {
+                       simd_opcode = MINT_SIMD_INTRINS_P_PP;
+                       if (atype == MONO_TYPE_U1)
+                               simd_intrins = INTERP_SIMD_INTRINSIC_WASM_I8X16_NARROW_I16X8_U;
+                       else if (atype == MONO_TYPE_U2)
+                               simd_intrins = INTERP_SIMD_INTRINSIC_WASM_I16X8_NARROW_I32X4_U;
+                       break;
+               }
+               default:
+                       return FALSE;
+       }
+
+       // Either the method or this particular element type has no intrinsic
+       if (simd_opcode == -1 || simd_intrins == -1) {
+               return FALSE;
+       }
+
+       interp_add_ins (td, simd_opcode);
+       td->last_ins->data [0] = simd_intrins;
+#else // HOST_BROWSER
+       return FALSE;
+#endif // HOST_BROWSER
+
+opcode_added:
+       // Wire the call's arguments in as sources of the instruction that was just added
+       td->sp -= csignature->param_count;
+       for (int i = 0; i < csignature->param_count; i++)
+               td->last_ins->sregs [i] = td->sp [i].local;
+
+       g_assert (csignature->ret->type != MONO_TYPE_VOID);
+       int ret_mt = mint_type (csignature->ret);
+       if (ret_mt == MINT_TYPE_VT) {
+               // For these intrinsics, if we return a VT then it is a V128
+               push_type_vt (td, vector_klass, vector_size);
+       } else {
+               push_simple_type (td, stack_type [ret_mt]);
+       }
+       interp_ins_set_dreg (td->last_ins, td->sp [-1].local);
+       // NOTE(review): 5 is presumably the size of the IL call being replaced -- confirm
+       td->ip += 5;
+       return TRUE;
+}
+
 static gboolean
 interp_emit_simd_intrinsics (TransformData *td, MonoMethod *cmethod, MonoMethodSignature *csignature)
 {
@@ -461,11 +705,21 @@ interp_emit_simd_intrinsics (TransformData *td, MonoMethod *cmethod, MonoMethodS
        class_ns = m_class_get_name_space (cmethod->klass);
        class_name = m_class_get_name (cmethod->klass);
 
-       if (!strcmp (class_ns, "System.Runtime.Intrinsics")) {
+       if (mono_opt_interp_simd_v128 && !strcmp (class_ns, "System.Runtime.Intrinsics")) {
                if (!strcmp (class_name, "Vector128"))
                        return emit_sri_vector128 (td, cmethod, csignature);
                else if (!strcmp (class_name, "Vector128`1"))
                        return emit_sri_vector128_t (td, cmethod, csignature);
+       } else if (mono_opt_interp_simd_packedsimd && !strcmp (class_ns, "System.Runtime.Intrinsics.Wasm")) {
+               if (!strcmp (class_name, "PackedSimd")) {
+                       gboolean res = emit_sri_packedsimd (td, cmethod, csignature);
+#if HOST_BROWSER
+                       if (!res)
+                               g_print ("MONO interpreter: Unsupported method: System.Runtime.Intrinsics.Wasm.PackedSimd.%s\n", cmethod->name);
+                       g_assert (res);
+#endif
+                       return res;
+               }
        }
        return FALSE;
 }
index 1a6f58a..6d8715c 100644 (file)
@@ -60,6 +60,12 @@ DEFINE_BOOL_READONLY(readonly_flag, "readonly-flag", FALSE, "Example")
 DEFINE_BOOL(wasm_exceptions, "wasm-exceptions", FALSE, "Enable codegen for WASM exceptions")
 DEFINE_BOOL(wasm_gc_safepoints, "wasm-gc-safepoints", FALSE, "Use GC safepoints on WASM")
 DEFINE_BOOL(aot_lazy_assembly_load, "aot-lazy-assembly-load", FALSE, "Load assemblies referenced by AOT images lazily")
+#if HOST_BROWSER
+DEFINE_BOOL(interp_simd_v128, "interp-simd-v128", FALSE, "Enable interpreter Vector128 support")
+#else
+DEFINE_BOOL(interp_simd_v128, "interp-simd-v128", TRUE, "Enable interpreter Vector128 support")
+#endif
+DEFINE_BOOL(interp_simd_packedsimd, "interp-simd-packedsimd", FALSE, "Enable interpreter WASM PackedSimd support")
 
 #if HOST_BROWSER
 
@@ -110,6 +116,8 @@ DEFINE_BOOL(jiterpreter_use_constants, "jiterpreter-use-constants", FALSE, "Use
 DEFINE_BOOL(jiterpreter_eliminate_null_checks, "jiterpreter-eliminate-null-checks", TRUE, "Attempt to eliminate redundant null checks in traces")
 // enables performing backward branches without exiting traces
 DEFINE_BOOL(jiterpreter_backward_branches_enabled, "jiterpreter-backward-branches-enabled", TRUE, "Enable performing backward branches without exiting traces")
+// Attempt to use WASM v128 opcodes to implement SIMD interpreter opcodes
+DEFINE_BOOL(jiterpreter_enable_simd, "jiterpreter-simd-enabled", TRUE, "Attempt to use WebAssembly SIMD support")
 // When compiling a jit_call wrapper, bypass sharedvt wrappers if possible by inlining their
 //  logic into the compiled wrapper and calling the target AOTed function with native call convention
 DEFINE_BOOL(jiterpreter_direct_jit_call, "jiterpreter-direct-jit-calls", TRUE, "Bypass gsharedvt wrappers when compiling JIT call wrappers")
index 3433327..cb04d36 100644 (file)
@@ -15,6 +15,8 @@ namespace Sample
         {
             measurements = new Measurement[] {
                 new Create(),
+                new PackConstant(),
+                new Pack(),
                 new Add(),
                 new Multiply(),
                 new DotInt(),
@@ -56,6 +58,25 @@ namespace Sample
             public override void RunStep() => vector = Vector128.Create(0x123456);
         }
 
+        // Measures building a Vector128<int> from four literal elements.
+        class PackConstant : VectorMeasurement
+        {
+            // Receives the result of every step
+            Vector128<int> vector;
+
+            public override string Name
+            {
+                get { return "Pack Vector128 (Constant)"; }
+            }
+
+            public override void RunStep()
+            {
+                vector = Vector128.Create(1, 2, 3, 4);
+            }
+        }
+
+        // Measures building a Vector128<int> from four elements read out of instance fields.
+        class Pack : VectorMeasurement
+        {
+            // Receives the result of every step
+            Vector128<int> vector;
+
+            // Element values, held in mutable fields rather than written as literals
+            int a = 1;
+            int b = 2;
+            int c = 3;
+            int d = 4;
+
+            public override string Name
+            {
+                get { return "Pack Vector128"; }
+            }
+
+            public override void RunStep()
+            {
+                vector = Vector128.Create(a, b, c, d);
+            }
+        }
+
         class Add : VectorMeasurement
         {
             Vector128<int> vector1, vector2, vector3;
index 4d3781b..1a39d15 100644 (file)
@@ -39,6 +39,8 @@ set_target_properties(dotnet PROPERTIES
     RUNTIME_OUTPUT_DIRECTORY "${NATIVE_BIN_DIR}")
 
 set(ignoreMeWasmOptFlags "${CONFIGURATION_WASM_OPT_FLAGS}")
+set(ignoreMeWasmOptAdditionalFlags "${WASM_OPT_ADDITIONAL_FLAGS}")
+set(ignoreMeEmsdkPath "${EMSDK_PATH}")
 
 if(CMAKE_BUILD_TYPE STREQUAL "Release")
     add_custom_command(TARGET dotnet
index 6c6912f..3b0a982 100644 (file)
@@ -125,6 +125,8 @@ const fn_signatures: SigLine[] = [
     [true, "mono_jiterp_boost_back_branch_target", "void", ["number"]],
     [true, "mono_jiterp_is_imethod_var_address_taken", "number", ["number", "number"]],
     [true, "mono_jiterp_get_opcode_value_table_entry", "number", ["number"]],
+    [true, "mono_jiterp_get_simd_intrinsic", "number", ["number", "number"]],
+    [true, "mono_jiterp_get_simd_opcode", "number", ["number", "number"]],
     ...legacy_interop_cwraps
 ];
 
@@ -246,6 +248,8 @@ export interface t_Cwraps {
     mono_jiterp_boost_back_branch_target(destination: number): void;
     mono_jiterp_is_imethod_var_address_taken(imethod: VoidPtr, offsetBytes: number): number;
     mono_jiterp_get_opcode_value_table_entry(opcode: number): number;
+    mono_jiterp_get_simd_intrinsic(arity: number, index: number): VoidPtr;
+    mono_jiterp_get_simd_opcode(arity: number, index: number): number;
 }
 
 const wrapped_c_functions: t_Cwraps = <any>{};
index 510b1db..de7f6e5 100755 (executable)
@@ -8,20 +8,25 @@ import sys
 import os
 import re
 
-if len (sys.argv) != 3:
-    print ("Usage: genmintops.py <src/mintops.def> <dest/mintops.ts>")
+if len (sys.argv) != 4:
+    print ("Usage: genmintops.py <src/mintops.def> <src/interp-simd-intrins.def> <dest/mintops.ts>")
     exit (1)
 
 src_header_path = sys.argv [1]
-output_ts_path = sys.argv [2]
+simd_header_path = sys.argv [2]
+output_ts_path = sys.argv [3]
 
 src = open(src_header_path, 'r')
+simd_src = open(simd_header_path, 'r')
 
 tab = "    "
 header_lines = src.read().splitlines()
+# strip preprocessing directives
+simd_header_lines = (l for l in simd_src.read().splitlines() if not l.startswith("#"))
 # strip preprocessing directives and add indentation for tslint/eslint
 header = "\n".join((tab + l) for l in header_lines if not l.startswith("#"))
 src.close()
+simd_src.close()
 
 opdef_regex = r'\s(IR)?OPDEF\((\w+),\s*(.+?),\s*(MintOp\w+)\)'
 enum_values = re.sub(
@@ -31,11 +36,36 @@ metadata_table = re.sub(
     opdef_regex, lambda m : f"[MintOpcode.{m.group(2)}]: [{m.group(3)}, MintOpArgType.{m.group(4)}],", header
 )
 
+simd_values_1 = []
+simd_values_2 = []
+simd_values_3 = []
+simd_disp = {
+    "INTERP_SIMD_INTRINSIC_P_P": simd_values_1,
+    "INTERP_SIMD_INTRINSIC_P_PP": simd_values_2,
+    "INTERP_SIMD_INTRINSIC_P_PPP": simd_values_3,
+    "INTERP_WASM_SIMD_INTRINSIC_V_P": simd_values_1,
+    "INTERP_WASM_SIMD_INTRINSIC_V_V": simd_values_1,
+    "INTERP_WASM_SIMD_INTRINSIC_I_V": simd_values_1,
+    "INTERP_WASM_SIMD_INTRINSIC_V_VV": simd_values_2,
+    "INTERP_WASM_SIMD_INTRINSIC_V_VI": simd_values_2,
+    "INTERP_WASM_SIMD_INTRINSIC_V_VVV": simd_values_3,
+}
+
+# Sort every intrinsic declared in the .def file into the list for its macro family.
+# A declaration looks like MACRO (NAME, ...): the text before the first "(" selects the
+# list through simd_disp, and the text between "(" and the following "," is the name.
+for line in simd_header_lines:
+    paren_idx = line.find("(")
+    # Lines with no macro name in front of a "(" do not declare anything
+    if paren_idx <= 0:
+        continue
+    # The name ends at the first comma after the parenthesis, never one before it
+    comma_idx = line.find(",", paren_idx + 1)
+    if comma_idx < 0:
+        continue
+    key = line[:paren_idx].strip()
+    # Fail loudly on an unknown macro: skipping it would leave the generated lists
+    # out of step with the .def file
+    if key not in simd_disp:
+        raise KeyError(f"unknown SIMD intrinsic macro {key!r} in {simd_header_path}; add it to simd_disp")
+    simd_disp[key].append(line[(paren_idx + 1):comma_idx].strip().replace("INTERP_SIMD_INTRINSIC_", ""))
+
+splitter = ",\n    "
+splitter2 = ",\n        "
+
 generated = f"""
 // Generated by genmintops.py from mintops.def.
 // Do not manually edit this file.
 
-import {{ OpcodeInfoTable, MintOpArgType }} from "./jiterpreter-opcodes";
+import {{ OpcodeInfoTable, MintOpArgType, SimdInfoTable }} from "./jiterpreter-opcodes";
 
 export const enum MintOpcode {{
 {enum_values}
@@ -46,6 +76,30 @@ export const enum MintOpcode {{
 export const OpcodeInfo : OpcodeInfoTable = {{
 {metadata_table}
 }};
+
+export const enum SimdIntrinsic2 {{
+    {splitter.join(simd_values_1)}
+}}
+
+export const enum SimdIntrinsic3 {{
+    {splitter.join(simd_values_2)}
+}}
+
+export const enum SimdIntrinsic4 {{
+    {splitter.join(simd_values_3)}
+}}
+
+export const SimdInfo : SimdInfoTable = {{
+    2: [
+        {splitter2.join(repr(x) for x in simd_values_1)}
+    ],
+    3: [
+        {splitter2.join(repr(x) for x in simd_values_2)}
+    ],
+    4: [
+        {splitter2.join(repr(x) for x in simd_values_3)}
+    ],
+}};
 """
 
 os.makedirs(os.path.dirname(output_ts_path), exist_ok=True)
index aad65e7..14651ad 100644 (file)
@@ -289,7 +289,7 @@ function flush_wasm_entry_trampoline_jit_queue() {
         // Emit function imports
         for (let i = 0; i < trampImports.length; i++) {
             mono_assert(trampImports[i], () => `trace #${i} missing`);
-            builder.defineImportedFunction("i", trampImports[i][0], trampImports[i][1], true, false, trampImports[i][2]);
+            builder.defineImportedFunction("i", trampImports[i][0], trampImports[i][1], true, trampImports[i][2]);
         }
 
         builder._generateImportSection();
@@ -343,12 +343,9 @@ function flush_wasm_entry_trampoline_jit_queue() {
             console.log(`jit queue generated ${buffer.length} byte(s) of wasm`);
         counters.bytesGenerated += buffer.length;
         const traceModule = new WebAssembly.Module(buffer);
+        const wasmImports = builder.getWasmImports();
 
-        const traceInstance = new WebAssembly.Instance(traceModule, {
-            i: builder.getImportedFunctionTable(),
-            c: <any>builder.getConstants(),
-            m: { h: (<any>Module).asm.memory },
-        });
+        const traceInstance = new WebAssembly.Instance(traceModule, wasmImports);
 
         // Now that we've jitted the trampolines, go through and fix up the function pointers
         //  to point to the new jitted trampolines instead of the default implementations
index 3f016ad..b1bf6a5 100644 (file)
@@ -11,7 +11,7 @@ import { WasmOpcode } from "./jiterpreter-opcodes";
 import {
     WasmValtype, WasmBuilder, addWasmFunctionPointer as addWasmFunctionPointer,
     _now, elapsedTimes, counters, getWasmFunctionTable, applyOptions,
-    recordFailure, getOptions
+    recordFailure, getOptions, bytesFromHex
 } from "./jiterpreter-support";
 import cwraps from "./cwraps";
 
@@ -157,7 +157,7 @@ class TrampolineInfo {
 }
 
 // this is cached replacements for Module.getWasmTableEntry();
-// we could add <EmccExportedLibraryFunction Include="$getWasmTableEntry" /> and <EmccExportedRuntimeMethod Include="getWasmTableEntry" /> 
+// we could add <EmccExportedLibraryFunction Include="$getWasmTableEntry" /> and <EmccExportedRuntimeMethod Include="getWasmTableEntry" />
 // if we need to export the original
 function getWasmTableEntry(index: number) {
     let result = fnCache[index];
@@ -236,9 +236,7 @@ function getIsWasmEhSupported(): boolean {
     // Probe whether the current environment can handle wasm exceptions
     try {
         // Load and compile the wasm version of do_jit_call_indirect. This serves as a way to probe for wasm EH
-        const bytes = new Uint8Array(doJitCall16.length / 2);
-        for (let i = 0; i < doJitCall16.length; i += 2)
-            bytes[i / 2] = parseInt(doJitCall16.substring(i, i + 2), 16);
+        const bytes = bytesFromHex(doJitCall16);
 
         counters.bytesGenerated += bytes.length;
         doJitCallModule = new WebAssembly.Module(bytes);
@@ -396,7 +394,7 @@ export function mono_interp_flush_jitcall_queue(): void {
 
         // Emit function imports
         for (let i = 0; i < trampImports.length; i++)
-            builder.defineImportedFunction("i", trampImports[i][0], trampImports[i][1], true, false, trampImports[i][2]);
+            builder.defineImportedFunction("i", trampImports[i][0], trampImports[i][1], true, trampImports[i][2]);
         builder._generateImportSection();
 
         // Function section
@@ -444,12 +442,9 @@ export function mono_interp_flush_jitcall_queue(): void {
             console.log(`do_jit_call queue flush generated ${buffer.length} byte(s) of wasm`);
         counters.bytesGenerated += buffer.length;
         const traceModule = new WebAssembly.Module(buffer);
+        const wasmImports = builder.getWasmImports();
 
-        const traceInstance = new WebAssembly.Instance(traceModule, {
-            i: builder.getImportedFunctionTable(),
-            c: <any>builder.getConstants(),
-            m: { h: (<any>Module).asm.memory }
-        });
+        const traceInstance = new WebAssembly.Instance(traceModule, wasmImports);
 
         for (let i = 0; i < jitQueue.length; i++) {
             const info = jitQueue[i];
index ccefa7b..3d347bd 100644 (file)
@@ -30,6 +30,12 @@ export type OpcodeInfoTable = {
     [key: number]: [name: string, length_u16: number, dregs: number, sregs: number, optype: MintOpArgType];
 }
 
+export type SimdInfoSubtable = Array<string> // intrinsic names of one group, in the order genmintops.py lists them
+
+export type SimdInfoTable = { // one name list per argument count; the generated SimdInfo fills keys 2, 3 and 4
+    [argument_count: number] : SimdInfoSubtable
+}
+
 // Keep this in sync with the wasm spec (but I don't think any changes will impact it),
 // Note that prefix opcodes aren't in this enum, since making them write properly is awkward.
 
@@ -229,5 +235,245 @@ export const enum WasmOpcode {
     i64_extend_32_s,
 
     PREFIX_sat = 0xfc,
+    PREFIX_simd = 0xfd,
     PREFIX_atomic = 0xfe
 }
+
+// Opcode values for the wasm SIMD (v128) instructions. A value from this enum is not a complete
+//  instruction encoding on its own: it is written after the WasmOpcode.PREFIX_simd byte.
+// Note that the entries below are not listed in strictly ascending numeric order.
+export const enum WasmSimdOpcode {
+    v128_load                     = 0x00,
+    v128_load8x8_s                = 0x01,
+    v128_load8x8_u                = 0x02,
+    v128_load16x4_s               = 0x03,
+    v128_load16x4_u               = 0x04,
+    v128_load32x2_s               = 0x05,
+    v128_load32x2_u               = 0x06,
+    v128_load8_splat              = 0x07,
+    v128_load16_splat             = 0x08,
+    v128_load32_splat             = 0x09,
+    v128_load64_splat             = 0x0a,
+    v128_store                    = 0x0b,
+    v128_const                    = 0x0c,
+    i8x16_shuffle                 = 0x0d,
+    i8x16_swizzle                 = 0x0e,
+    i8x16_splat                   = 0x0f,
+    i16x8_splat                   = 0x10,
+    i32x4_splat                   = 0x11,
+    i64x2_splat                   = 0x12,
+    f32x4_splat                   = 0x13,
+    f64x2_splat                   = 0x14,
+    i8x16_extract_lane_s          = 0x15,
+    i8x16_extract_lane_u          = 0x16,
+    i8x16_replace_lane            = 0x17,
+    i16x8_extract_lane_s          = 0x18,
+    i16x8_extract_lane_u          = 0x19,
+    i16x8_replace_lane            = 0x1a,
+    i32x4_extract_lane            = 0x1b,
+    i32x4_replace_lane            = 0x1c,
+    i64x2_extract_lane            = 0x1d,
+    i64x2_replace_lane            = 0x1e,
+    f32x4_extract_lane            = 0x1f,
+    f32x4_replace_lane            = 0x20,
+    f64x2_extract_lane            = 0x21,
+    f64x2_replace_lane            = 0x22,
+    i8x16_eq                      = 0x23,
+    i8x16_ne                      = 0x24,
+    i8x16_lt_s                    = 0x25,
+    i8x16_lt_u                    = 0x26,
+    i8x16_gt_s                    = 0x27,
+    i8x16_gt_u                    = 0x28,
+    i8x16_le_s                    = 0x29,
+    i8x16_le_u                    = 0x2a,
+    i8x16_ge_s                    = 0x2b,
+    i8x16_ge_u                    = 0x2c,
+    i16x8_eq                      = 0x2d,
+    i16x8_ne                      = 0x2e,
+    i16x8_lt_s                    = 0x2f,
+    i16x8_lt_u                    = 0x30,
+    i16x8_gt_s                    = 0x31,
+    i16x8_gt_u                    = 0x32,
+    i16x8_le_s                    = 0x33,
+    i16x8_le_u                    = 0x34,
+    i16x8_ge_s                    = 0x35,
+    i16x8_ge_u                    = 0x36,
+    i32x4_eq                      = 0x37,
+    i32x4_ne                      = 0x38,
+    i32x4_lt_s                    = 0x39,
+    i32x4_lt_u                    = 0x3a,
+    i32x4_gt_s                    = 0x3b,
+    i32x4_gt_u                    = 0x3c,
+    i32x4_le_s                    = 0x3d,
+    i32x4_le_u                    = 0x3e,
+    i32x4_ge_s                    = 0x3f,
+    i32x4_ge_u                    = 0x40,
+    f32x4_eq                      = 0x41,
+    f32x4_ne                      = 0x42,
+    f32x4_lt                      = 0x43,
+    f32x4_gt                      = 0x44,
+    f32x4_le                      = 0x45,
+    f32x4_ge                      = 0x46,
+    f64x2_eq                      = 0x47,
+    f64x2_ne                      = 0x48,
+    f64x2_lt                      = 0x49,
+    f64x2_gt                      = 0x4a,
+    f64x2_le                      = 0x4b,
+    f64x2_ge                      = 0x4c,
+    v128_not                      = 0x4d,
+    v128_and                      = 0x4e,
+    v128_andnot                   = 0x4f,
+    v128_or                       = 0x50,
+    v128_xor                      = 0x51,
+    v128_bitselect                = 0x52,
+    i8x16_abs                     = 0x60,
+    i8x16_neg                     = 0x61,
+    i8x16_all_true                = 0x63,
+    i8x16_bitmask                 = 0x64,
+    i8x16_narrow_i16x8_s          = 0x65,
+    i8x16_narrow_i16x8_u          = 0x66,
+    i8x16_shl                     = 0x6b,
+    i8x16_shr_s                   = 0x6c,
+    i8x16_shr_u                   = 0x6d,
+    i8x16_add                     = 0x6e,
+    i8x16_add_sat_s               = 0x6f,
+    i8x16_add_sat_u               = 0x70,
+    i8x16_sub                     = 0x71,
+    i8x16_sub_sat_s               = 0x72,
+    i8x16_sub_sat_u               = 0x73,
+    i8x16_min_s                   = 0x76,
+    i8x16_min_u                   = 0x77,
+    i8x16_max_s                   = 0x78,
+    i8x16_max_u                   = 0x79,
+    i8x16_avgr_u                  = 0x7b,
+    i16x8_abs                     = 0x80,
+    i16x8_neg                     = 0x81,
+    i16x8_all_true                = 0x83,
+    i16x8_bitmask                 = 0x84,
+    i16x8_narrow_i32x4_s          = 0x85,
+    i16x8_narrow_i32x4_u          = 0x86,
+    i16x8_extend_low_i8x16_s      = 0x87,
+    i16x8_extend_high_i8x16_s     = 0x88,
+    i16x8_extend_low_i8x16_u      = 0x89,
+    i16x8_extend_high_i8x16_u     = 0x8a,
+    i16x8_shl                     = 0x8b,
+    i16x8_shr_s                   = 0x8c,
+    i16x8_shr_u                   = 0x8d,
+    i16x8_add                     = 0x8e,
+    i16x8_add_sat_s               = 0x8f,
+    i16x8_add_sat_u               = 0x90,
+    i16x8_sub                     = 0x91,
+    i16x8_sub_sat_s               = 0x92,
+    i16x8_sub_sat_u               = 0x93,
+    i16x8_mul                     = 0x95,
+    i16x8_min_s                   = 0x96,
+    i16x8_min_u                   = 0x97,
+    i16x8_max_s                   = 0x98,
+    i16x8_max_u                   = 0x99,
+    i16x8_avgr_u                  = 0x9b,
+    i32x4_abs                     = 0xa0,
+    i32x4_neg                     = 0xa1,
+    i32x4_all_true                = 0xa3,
+    i32x4_bitmask                 = 0xa4,
+    i32x4_extend_low_i16x8_s      = 0xa7,
+    i32x4_extend_high_i16x8_s     = 0xa8,
+    i32x4_extend_low_i16x8_u      = 0xa9,
+    i32x4_extend_high_i16x8_u     = 0xaa,
+    i32x4_shl                     = 0xab,
+    i32x4_shr_s                   = 0xac,
+    i32x4_shr_u                   = 0xad,
+    i32x4_add                     = 0xae,
+    i32x4_sub                     = 0xb1,
+    i32x4_mul                     = 0xb5,
+    i32x4_min_s                   = 0xb6,
+    i32x4_min_u                   = 0xb7,
+    i32x4_max_s                   = 0xb8,
+    i32x4_max_u                   = 0xb9,
+    i32x4_dot_i16x8_s             = 0xba,
+    i64x2_abs                     = 0xc0,
+    i64x2_neg                     = 0xc1,
+    i64x2_bitmask                 = 0xc4,
+    i64x2_extend_low_i32x4_s      = 0xc7,
+    i64x2_extend_high_i32x4_s     = 0xc8,
+    i64x2_extend_low_i32x4_u      = 0xc9,
+    i64x2_extend_high_i32x4_u     = 0xca,
+    i64x2_shl                     = 0xcb,
+    i64x2_shr_s                   = 0xcc,
+    i64x2_shr_u                   = 0xcd,
+    i64x2_add                     = 0xce,
+    i64x2_sub                     = 0xd1,
+    i64x2_mul                     = 0xd5,
+    f32x4_ceil                    = 0x67,
+    f32x4_floor                   = 0x68,
+    f32x4_trunc                   = 0x69,
+    f32x4_nearest                 = 0x6a,
+    f64x2_ceil                    = 0x74,
+    f64x2_floor                   = 0x75,
+    f64x2_trunc                   = 0x7a,
+    f64x2_nearest                 = 0x94,
+    f32x4_abs                     = 0xe0,
+    f32x4_neg                     = 0xe1,
+    f32x4_sqrt                    = 0xe3,
+    f32x4_add                     = 0xe4,
+    f32x4_sub                     = 0xe5,
+    f32x4_mul                     = 0xe6,
+    f32x4_div                     = 0xe7,
+    f32x4_min                     = 0xe8,
+    f32x4_max                     = 0xe9,
+    f32x4_pmin                    = 0xea,
+    f32x4_pmax                    = 0xeb,
+    f64x2_abs                     = 0xec,
+    f64x2_neg                     = 0xed,
+    f64x2_sqrt                    = 0xef,
+    f64x2_add                     = 0xf0,
+    f64x2_sub                     = 0xf1,
+    f64x2_mul                     = 0xf2,
+    f64x2_div                     = 0xf3,
+    f64x2_min                     = 0xf4,
+    f64x2_max                     = 0xf5,
+    f64x2_pmin                    = 0xf6,
+    f64x2_pmax                    = 0xf7,
+    i32x4_trunc_sat_f32x4_s       = 0xf8,
+    i32x4_trunc_sat_f32x4_u       = 0xf9,
+    f32x4_convert_i32x4_s         = 0xfa,
+    f32x4_convert_i32x4_u         = 0xfb,
+    v128_load32_zero              = 0x5c,
+    v128_load64_zero              = 0x5d,
+    i16x8_extmul_low_i8x16_s      = 0x9c,
+    i16x8_extmul_high_i8x16_s     = 0x9d,
+    i16x8_extmul_low_i8x16_u      = 0x9e,
+    i16x8_extmul_high_i8x16_u     = 0x9f,
+    i32x4_extmul_low_i16x8_s      = 0xbc,
+    i32x4_extmul_high_i16x8_s     = 0xbd,
+    i32x4_extmul_low_i16x8_u      = 0xbe,
+    i32x4_extmul_high_i16x8_u     = 0xbf,
+    i64x2_extmul_low_i32x4_s      = 0xdc,
+    i64x2_extmul_high_i32x4_s     = 0xdd,
+    i64x2_extmul_low_i32x4_u      = 0xde,
+    i64x2_extmul_high_i32x4_u     = 0xdf,
+    i16x8_q15mulr_sat_s           = 0x82,
+    v128_any_true                 = 0x53,
+    v128_load8_lane               = 0x54,
+    v128_load16_lane              = 0x55,
+    v128_load32_lane              = 0x56,
+    v128_load64_lane              = 0x57,
+    v128_store8_lane              = 0x58,
+    v128_store16_lane             = 0x59,
+    v128_store32_lane             = 0x5a,
+    v128_store64_lane             = 0x5b,
+    i64x2_eq                      = 0xd6,
+    i64x2_ne                      = 0xd7,
+    i64x2_lt_s                    = 0xd8,
+    i64x2_gt_s                    = 0xd9,
+    i64x2_le_s                    = 0xda,
+    i64x2_ge_s                    = 0xdb,
+    i64x2_all_true                = 0xc3,
+    f64x2_convert_low_i32x4_s     = 0xfe,
+    f64x2_convert_low_i32x4_u     = 0xff,
+    i32x4_trunc_sat_f64x2_s_zero  = 0xfc,
+    i32x4_trunc_sat_f64x2_u_zero  = 0xfd,
+    f32x4_demote_f64x2_zero       = 0x5e,
+    f64x2_promote_low_f32x4       = 0x5f,
+    i8x16_popcnt                  = 0x62,
+    i16x8_extadd_pairwise_i8x16_s = 0x7c,
+    i16x8_extadd_pairwise_i8x16_u = 0x7d,
+    i32x4_extadd_pairwise_i16x8_s = 0x7e,
+    i32x4_extadd_pairwise_i16x8_u = 0x7f,
+}
index 010e674..3067575 100644 (file)
@@ -4,7 +4,7 @@
 import { mono_assert } from "./types";
 import { NativePointer, ManagedPointer, VoidPtr } from "./types/emscripten";
 import { Module, runtimeHelpers } from "./globals";
-import { WasmOpcode } from "./jiterpreter-opcodes";
+import { WasmOpcode, WasmSimdOpcode } from "./jiterpreter-opcodes";
 import { MintOpcode } from "./mintops";
 import cwraps from "./cwraps";
 
@@ -118,7 +118,6 @@ type ImportedFunctionInfo = {
     typeIndex: number;
     module: string;
     name: string;
-    assumeUsed: boolean;
     func: Function;
 }
 
@@ -166,6 +165,7 @@ export class WasmBuilder {
     nextConstantSlot = 0;
 
     compressImportNames = false;
+    lockImports = false;
 
     constructor(constantSlotCount: number) {
         this.stack = [new BlobBuilder()];
@@ -178,6 +178,7 @@ export class WasmBuilder {
         this.stackSize = 1;
         this.inSection = false;
         this.inFunction = false;
+        this.lockImports = false;
         this.locals.clear();
 
         this.functionTypeCount = this.permanentFunctionTypeCount;
@@ -186,13 +187,12 @@ export class WasmBuilder {
         this.functionTypesByIndex = Object.create(this.permanentFunctionTypesByIndex);
 
         this.nextImportIndex = 0;
-        this.importedFunctionCount = this.permanentImportedFunctionCount;
+        this.importedFunctionCount = 0;
         this.importedFunctions = Object.create(this.permanentImportedFunctions);
 
         for (const k in this.importedFunctions) {
             const f = this.importedFunctions[k];
-            if (!f.assumeUsed)
-                f.index = undefined;
+            f.index = undefined;
         }
 
         this.functions.length = 0;
@@ -235,15 +235,45 @@ export class WasmBuilder {
             return current.getArrayView(false).slice(0, current.size);
     }
 
+    // Builds the imports object to hand to WebAssembly.Instance for a module produced by
+    //  this builder. It always contains the constants ("c"), the memory ("m"."h") and the
+    //  function table ("f"."f"), plus one entry per imported function that will be emitted,
+    //  grouped by the import's module name.
+    // Throws if any import to be emitted does not have an actual function attached.
+    getWasmImports () : WebAssembly.Imports {
+        const imports : any = {
+            c: <any>this.getConstants(),
+            m: { h: (<any>Module).asm.memory },
+            f: { f: getWasmFunctionTable() },
+        };
+
+        for (const info of this.getImportsToEmit()) {
+            if (typeof (info.func) !== "function")
+                throw new Error(`Import '${info.name}' not found or not a function`);
+
+            // The key must be the same (possibly compressed) name that is written
+            //  into the module's import section
+            const key = this.getCompressedName(info);
+            const moduleTable = imports[info.module] || (imports[info.module] = {});
+            moduleTable[key] = info.func;
+        }
+
+        return imports;
+    }
+
     // HACK: Approximate amount of space we need to generate the full module at present
     // FIXME: This does not take into account any other functions already generated if they weren't
     //  emitted into the module immediately
-    get bytesGeneratedSoFar() {
+    get bytesGeneratedSoFar () {
+        const importSize = this.compressImportNames
+            // mod (2 bytes) name (2-3 bytes) type (1 byte) typeidx (1-2 bytes)
+            ? 8
+            // we keep the uncompressed import names somewhat short, generally, so +12 bytes is about right
+            : 20;
+
         return this.stack[0].size +
             // HACK: A random constant for section headers and padding
             32 +
-            // mod (2 bytes) name (2-3 bytes) type (1 byte) typeidx (1-2 bytes)
-            (this.importedFunctionCount * 8) +
+            (this.importedFunctionCount * importSize) +
             // type index for each function
             (this.functions.length * 2) +
             // export entry for each export
@@ -264,7 +294,13 @@ export class WasmBuilder {
         return this.current.appendU8(value);
     }
 
-    appendU32(value: number) {
+    // Appends a SIMD opcode to the current blob: the PREFIX_simd byte, then the opcode value.
+    // Returns whatever appendULeb returns for the opcode value.
+    appendSimd (value: WasmSimdOpcode) {
+        this.current.appendU8(WasmOpcode.PREFIX_simd);
+        // The opcode after the prefix is deliberately written as unsigned LEB128 and not as a
+        //  raw byte, so any opcode value of 0x80 or above occupies two bytes in the output.
+        return this.current.appendULeb(value);
+    }
+
+    appendU32 (value: number) {
         return this.current.appendU32(value);
     }
 
@@ -424,8 +460,8 @@ export class WasmBuilder {
         return imports;
     }
 
-    getCompressedName(ifi: ImportedFunctionInfo) {
-        if (!this.compressImportNames || typeof (ifi.index) !== "number")
+    getCompressedName (ifi: ImportedFunctionInfo) {
+        if (!this.compressImportNames || typeof(ifi.index) !== "number")
             return ifi.name;
 
         let result = compressedNameCache[ifi.index!];
@@ -434,23 +470,31 @@ export class WasmBuilder {
         return result;
     }
 
-    _generateImportSection() {
-        const importsToEmit = [];
+    // Returns the imported functions that currently have a numeric index assigned,
+    //  sorted by ascending index. Imports without a numeric index are left out.
+    getImportsToEmit () {
+        const result = [];
         for (const k in this.importedFunctions) {
-            const f = this.importedFunctions[k];
-            if (f.index !== undefined)
-                importsToEmit.push(f);
+            const v = this.importedFunctions[k];
+            if (typeof (v.index) !== "number")
+                continue;
+            result.push(v);
         }
-        importsToEmit.sort((lhs, rhs) => lhs.index! - rhs.index!);
+        result.sort((lhs, rhs) => lhs.index! - rhs.index!);
+        // console.log("result=[" + result.map(f => `#${f.index} ${f.module}.${f.name}`) + "]");
+        return result;
+    }
+
+    _generateImportSection () {
+        const importsToEmit = this.getImportsToEmit();
+        this.lockImports = true;
 
         // Import section
         this.beginSection(2);
-        this.appendULeb(1 + importsToEmit.length + this.constantSlots.length);
+        this.appendULeb(2 + importsToEmit.length + this.constantSlots.length);
 
-        // console.log(`referenced ${importsToEmit.length}/${allImports.length} import(s)`);
+        // console.log(`referenced ${importsToEmit.length} import(s)`);
         for (let i = 0; i < importsToEmit.length; i++) {
             const ifi = importsToEmit[i];
-            // console.log(`  #${ifi.index} ${ifi.module}.${ifi.name} = ${ifi.friendlyName}`);
+            // console.log(`  #${ifi.index} ${ifi.module}.${ifi.name} = ${ifi.func}`);
             this.appendName(ifi.module);
             this.appendName(this.getCompressedName(ifi));
             this.appendU8(0x0); // function
@@ -472,14 +516,26 @@ export class WasmBuilder {
         this.appendU8(0x00);
         // Minimum size is in 64k pages, not bytes
         this.appendULeb(0x01);
+
+        this.appendName("f");
+        this.appendName("f");
+        // tabletype
+        this.appendU8(0x01);
+        // funcref
+        this.appendU8(0x70);
+        // limits = { min=0x01, max=infinity }
+        this.appendU8(0x00);
+        this.appendULeb(0x01);
     }
 
     defineImportedFunction(
         module: string, name: string, functionTypeName: string,
-        assumeUsed: boolean, permanent: boolean, func: Function | number
-    ): ImportedFunctionInfo {
-        if (permanent && (this.importedFunctionCount > this.permanentImportedFunctionCount))
-            throw new Error("New permanent imports cannot be defined after non-permanent ones");
+        permanent: boolean, func: Function | number
+    ) : ImportedFunctionInfo {
+        if (this.lockImports)
+            throw new Error("Import section already generated");
+        if (permanent && (this.importedFunctionCount > 0))
+            throw new Error("New permanent imports cannot be defined after any indexes have been assigned");
         const type = this.functionTypes[functionTypeName];
         if (!type)
             throw new Error("No function type named " + functionTypeName);
@@ -487,23 +543,15 @@ export class WasmBuilder {
             throw new Error("A permanent import must have a permanent function type");
         const typeIndex = type[0];
         const table = permanent ? this.permanentImportedFunctions : this.importedFunctions;
-        const index = assumeUsed
-            ? (
-                permanent
-                    ? this.permanentImportedFunctionCount++
-                    : this.importedFunctionCount++
-            )
-            : undefined;
         if (typeof (func) === "number")
             func = getWasmFunctionTable().get(func);
         if (typeof (func) !== "function")
             throw new Error(`Value passed for imported function ${name} was not a function or valid function pointer`);
         const result = table[name] = {
-            index,
+            index: undefined,
             typeIndex,
             module,
             name,
-            assumeUsed,
             func
         };
         return result;
@@ -581,11 +629,21 @@ export class WasmBuilder {
         this.endSection();
     }
 
-    callImport(name: string) {
+    // Emits a call_indirect instruction: the opcode, then the type index of the named
+    //  function type, then the index of the table to call through.
+    // Throws if no function type has been defined under functionTypeName.
+    call_indirect (functionTypeName: string, tableIndex: number) {
+        const signature = this.functionTypes[functionTypeName];
+        if (!signature)
+            throw new Error("No function type named " + functionTypeName);
+
+        this.appendU8(WasmOpcode.call_indirect);
+        // The first element of a function type record is its type index
+        this.appendULeb(signature[0]);
+        this.appendULeb(tableIndex);
+    }
+
+    callImport (name: string) {
         const func = this.importedFunctions[name];
         if (!func)
             throw new Error("No imported function named " + name);
-        if (func.index === undefined)
+        if (typeof (func.index) !== "number")
             func.index = this.importedFunctionCount++;
         this.appendU8(WasmOpcode.call);
         this.appendULeb(func.index);
@@ -1325,6 +1383,9 @@ export const elapsedTimes = {
     compilation: 0
 };
 
+// Named counters, starting out empty; also reachable as counters.simdFallback.
+// NOTE(review): presumably keyed by SIMD intrinsic name and incremented when a trace has to
+//  fall back to a non-inlined implementation - confirm at the sites that update it.
+export const simdFallbackCounters : { [name: string] : number } = {
+};
+
 export const counters = {
     traceCandidates: 0,
     tracesCompiled: 0,
@@ -1336,6 +1397,7 @@ export const counters = {
     nullChecksEliminated: 0,
     backBranchesEmitted: 0,
     backBranchesNotEmitted: 0,
+    simdFallback: simdFallbackCounters,
 };
 
 export const _now = (globalThis.performance && globalThis.performance.now)
@@ -1636,6 +1698,13 @@ export function importDef(name: string, fn: Function): [string, string, Function
     return [name, name, fn];
 }
 
+// Decodes a string of hexadecimal digits (two digits per byte, no prefix, no separators)
+//  into a byte array. An empty string produces an empty array.
+// Throws if the string has an odd number of characters or contains anything that is not a
+//  hex digit, because such input would otherwise silently decode to truncated or wrong bytes.
+export function bytesFromHex (hex: string) : Uint8Array {
+    if ((hex.length % 2) !== 0)
+        throw new Error(`Hex string has an odd length (${hex.length})`);
+    // parseInt stops at the first character it cannot parse and yields a partial value or NaN
+    //  (which a Uint8Array stores as 0), so malformed input has to be rejected up front
+    if (/[^0-9a-fA-F]/.test(hex))
+        throw new Error("Hex string contains a non-hexadecimal character");
+    const bytes = new Uint8Array(hex.length / 2);
+    for (let i = 0; i < hex.length; i += 2)
+        bytes[i / 2] = parseInt(hex.substring(i, i + 2), 16);
+    return bytes;
+}
+
 export type JiterpreterOptions = {
     enableAll?: boolean;
     enableTraces: boolean;
@@ -1644,6 +1713,7 @@ export type JiterpreterOptions = {
     enableBackwardBranches: boolean;
     enableCallResume: boolean;
     enableWasmEh: boolean;
+    enableSimd: boolean;
     // For locations where the jiterpreter heuristic says we will be unable to generate
     //  a trace, insert an entry point opcode anyway. This enables collecting accurate
     //  stats for options like estimateHeat, but raises overhead.
@@ -1685,6 +1755,7 @@ const optionNames: { [jsName: string]: string } = {
     "enableBackwardBranches": "jiterpreter-backward-branch-entries-enabled",
     "enableCallResume": "jiterpreter-call-resume-enabled",
     "enableWasmEh": "jiterpreter-wasm-eh-enabled",
+    "enableSimd": "jiterpreter-simd-enabled",
     "enableStats": "jiterpreter-stats-enabled",
     "disableHeuristic": "jiterpreter-disable-heuristic",
     "estimateHeat": "jiterpreter-estimate-heat",
index 0f80f06..cdce074 100644 (file)
@@ -2,24 +2,28 @@
 // The .NET Foundation licenses this file to you under the MIT license.
 
 import { mono_assert, MonoMethod } from "./types";
+import { Module } from "./globals";
 import { NativePointer } from "./types/emscripten";
 import {
     getU16, getI16,
     getU32_unaligned, getI32_unaligned, getF32_unaligned, getF64_unaligned,
 } from "./memory";
-import { WasmOpcode } from "./jiterpreter-opcodes";
-import { MintOpcode, OpcodeInfo } from "./mintops";
+import { WasmOpcode, WasmSimdOpcode } from "./jiterpreter-opcodes";
+import {
+    MintOpcode, OpcodeInfo, SimdInfo,
+    SimdIntrinsic2, SimdIntrinsic3, SimdIntrinsic4
+} from "./mintops";
 import cwraps from "./cwraps";
 import {
     MintOpcodePtr, WasmValtype, WasmBuilder,
     append_memset_dest, append_bailout, append_exit,
     append_memmove_dest_src, try_append_memset_fast,
-    try_append_memmove_fast, counters,
+    try_append_memmove_fast, counters, bytesFromHex,
     getMemberOffset, JiterpMember, BailoutReason,
     getOpcodeTableValue
 } from "./jiterpreter-support";
 import {
-    sizeOfDataItem,
+    sizeOfDataItem, sizeOfV128, sizeOfStackval,
 
     disabledOpcodes, countCallTargets,
     callTargetCounts, trapTraceErrors,
@@ -158,7 +162,7 @@ export function generateWasmBody (
 ) : number {
     const abort = <MintOpcodePtr><any>0;
     let isFirstInstruction = true, isConditionallyExecuted = false,
-        firstOpcodeInBlock = true;
+        firstOpcodeInBlock = true, containsSimd = false;
     let result = 0,
         prologueOpcodeCounter = 0,
         conditionalOpcodeCounter = 0;
@@ -203,9 +207,20 @@ export function generateWasmBody (
 
         let opcode = getU16(ip);
         const info = OpcodeInfo[opcode];
+        const isSimdIntrins = (opcode >= MintOpcode.MINT_SIMD_INTRINS_P_P) &&
+            (opcode <= MintOpcode.MINT_SIMD_INTRINS_P_PPP);
+        const simdIntrinsArgCount = isSimdIntrins
+            ? opcode - MintOpcode.MINT_SIMD_INTRINS_P_P + 2
+            : 0;
+        const simdIntrinsIndex = isSimdIntrins
+            ? getArgU16(ip, 1 + simdIntrinsArgCount)
+            : 0;
+
         mono_assert(info, () => `invalid opcode ${opcode}`);
 
-        const opname = info[0];
+        const opname = isSimdIntrins
+            ? SimdInfo[simdIntrinsArgCount][simdIntrinsIndex]
+            : info[0];
         const _ip = ip;
         const isBackBranchTarget = builder.options.noExitBackwardBranches &&
             is_backward_branch_target(ip, startOfBody, backwardBranchTable),
@@ -1293,6 +1308,14 @@ export function generateWasmBody (
                         append_exit(builder, ip, exitOpcodeCounter, BailoutReason.ComplexBranch);
                     } else
                         ip = abort;
+                } else if (
+                    (opcode >= MintOpcode.MINT_SIMD_V128_LDC) &&
+                    (opcode <= MintOpcode.MINT_SIMD_INTRINS_P_PPP)
+                ) {
+                    if (!emit_simd(builder, ip, opcode, opname, simdIntrinsArgCount, simdIntrinsIndex))
+                        ip = abort;
+                    else
+                        containsSimd = true;
                 } else if (opcodeValue === 0) {
                     // This means it was explicitly marked as no-value in the opcode value table
                     //  so we can just skip over it. This is done for things like nops.
@@ -1376,6 +1399,11 @@ export function generateWasmBody (
 
     // console.log(`estimated size: ${builder.size + builder.cfg.overheadBytes + builder.bytesGeneratedSoFar}`);
 
+    // HACK: Traces containing simd will be *much* shorter than non-simd traces,
+    //  which will cause both the heuristic and our length requirement outside
+    //  to reject them. For now, just add a big constant to the length
+    if (containsSimd)
+        result += 10240;
     return result;
 }
 
@@ -1404,12 +1432,16 @@ function append_branch_target_block (builder: WasmBuilder, ip: MintOpcodePtr, is
     builder.cfg.startBranchBlock(ip, isBackBranchTarget);
 }
 
-function append_ldloc (builder: WasmBuilder, offset: number, opcode: WasmOpcode) {
+// Emits a load relative to pLocals at the given offset. opcodeOrPrefix is written as a single
+//  byte; when simdOpcode is supplied it is written right after it, and the memarg alignment is 0.
+function append_ldloc (builder: WasmBuilder, offset: number, opcodeOrPrefix: WasmOpcode, simdOpcode?: WasmSimdOpcode) {
     builder.local("pLocals");
-    builder.appendU8(opcode);
+    builder.appendU8(opcodeOrPrefix);
+    if (simdOpcode !== undefined) {
+        // Not a mistake: the opcode that follows a prefix byte is written as unsigned LEB128, not as a raw u8.
+        builder.appendULeb(simdOpcode);
+    }
     // stackval is 8 bytes, but pLocals might not be 8 byte aligned so we use 4
     // wasm spec prohibits alignment higher than natural alignment, just to be annoying
-    const alignment = (opcode > WasmOpcode.f64_load) ? 0 : 2;
+    const alignment = (simdOpcode !== undefined) || (opcodeOrPrefix > WasmOpcode.f64_load) ? 0 : 2;
     builder.appendMemarg(offset, alignment);
 }
 
@@ -1418,11 +1450,15 @@ function append_ldloc (builder: WasmBuilder, offset: number, opcode: WasmOpcode)
 //  where the offset+alignment pair is referred to as a 'memarg' by the spec.
 // The actual store operation is equivalent to `pBase[offset] = value` (alignment has no
 //  observable impact on behavior, other than causing compilation failures if out of range)
-function append_stloc_tail (builder: WasmBuilder, offset: number, opcode: WasmOpcode) {
-    builder.appendU8(opcode);
+function append_stloc_tail (builder: WasmBuilder, offset: number, opcodeOrPrefix: WasmOpcode, simdOpcode?: WasmSimdOpcode) {
+    builder.appendU8(opcodeOrPrefix);
+    if (simdOpcode !== undefined) {
+        // Not a mistake: the opcode that follows a prefix byte is written as unsigned LEB128, not as a raw u8.
+        builder.appendULeb(simdOpcode);
+    }
     // stackval is 8 bytes, but pLocals might not be 8 byte aligned so we use 4
     // wasm spec prohibits alignment higher than natural alignment, just to be annoying
-    const alignment = (opcode > WasmOpcode.f64_store) ? 0 : 2;
+    const alignment = (simdOpcode !== undefined) || (opcodeOrPrefix > WasmOpcode.f64_store) ? 0 : 2;
     builder.appendMemarg(offset, alignment);
     invalidate_local(offset);
 }
@@ -1759,13 +1795,13 @@ function emit_fieldop (
         case MintOpcode.MINT_STFLD_O: {
             /*
              * Writing a ref-type field has to call an import to perform the write barrier anyway,
-             *  and technically it should use a different kind of barrier from copy_pointer. So
+             *  and technically it should use a different kind of barrier from copy_ptr. So
              *  we define a special import that is responsible for performing the whole stfld_o
              *  operation with as little trace-side overhead as possible
              * Previously the pseudocode looked like:
              *  cknull_ptr = *(MonoObject *)&locals[objectOffset];
              *  if (!cknull_ptr) bailout;
-             *  copy_pointer(cknull_ptr + fieldOffset, *(MonoObject *)&locals[localOffset])
+             *  copy_ptr(cknull_ptr + fieldOffset, *(MonoObject *)&locals[localOffset])
              * The null check optimization also allows us to safely omit the bailout check
              *  if we know that the target object isn't null. Even if the target object were
              *  somehow null in this case (bad! shouldn't be possible!) it won't be a crash
@@ -1938,7 +1974,7 @@ function emit_sfieldop (
             // src
             append_ldloca(builder, localOffset, 0);
             // FIXME: Use mono_gc_wbarrier_set_field_internal
-            builder.callImport("copy_pointer");
+            builder.callImport("copy_ptr");
             return true;
         case MintOpcode.MINT_LDSFLD_VT: {
             const sizeBytes = getArgU16(ip, 4);
@@ -2048,7 +2084,7 @@ const unopTable : { [opcode: number]: OpRec3 | undefined } = {
     [MintOpcode.MINT_POPCNT_I4]:      [WasmOpcode.i32_popcnt,    WasmOpcode.i32_load, WasmOpcode.i32_store],
     [MintOpcode.MINT_CLZ_I8]:         [WasmOpcode.i64_clz,       WasmOpcode.i64_load, WasmOpcode.i64_store],
     [MintOpcode.MINT_CTZ_I8]:         [WasmOpcode.i64_ctz,       WasmOpcode.i64_load, WasmOpcode.i64_store],
-    [MintOpcode.MINT_POPCNT_I8]:      [WasmOpcode.i64_popcnt,    WasmOpcode.i32_load, WasmOpcode.i32_store],
+    [MintOpcode.MINT_POPCNT_I8]:      [WasmOpcode.i64_popcnt,    WasmOpcode.i64_load, WasmOpcode.i64_store],
 };
 
 // HACK: Generating correct wasm for these is non-trivial so we hand them off to C.
@@ -3023,7 +3059,7 @@ function emit_indirectop (builder: WasmBuilder, ip: MintOpcodePtr, opcode: MintO
         builder.local("cknull_ptr");
         // Load address of value so that copy_managed_pointer can grab it
         append_ldloca(builder, valueVarIndex, 0);
-        builder.callImport("copy_pointer");
+        builder.callImport("copy_ptr");
     } else {
         // Pre-load address for the store operation
         builder.local("cknull_ptr");
@@ -3237,6 +3273,331 @@ function emit_arrayop (builder: WasmBuilder, frame: NativePointer, ip: MintOpcod
     return true;
 }
 
+// Hex-encoded bytes of the SIMD probe module. See wasm-simd-feature-detect.wat/wasm
+const vec128Test =
+    "0061736d0100000001040160000003020100070801047465737400000a090107004100fd111a0b";
+// Cached probe result; stays undefined until getIsWasmSimdSupported has run once
+let wasmSimdSupported : boolean | undefined;
+
+// Reports whether the current environment can compile wasm v128 opcodes.
+// The probe runs at most once; later calls return the cached answer.
+function getIsWasmSimdSupported () : boolean {
+    if (wasmSimdSupported === undefined) {
+        // No cached answer yet: compile a test module that uses i32x4.splat and treat
+        //  any exception as meaning that v128 opcodes are unavailable.
+        try {
+            const probeBytes = bytesFromHex(vec128Test);
+            counters.bytesGenerated += probeBytes.length;
+            new WebAssembly.Module(probeBytes);
+            wasmSimdSupported = true;
+        } catch (exc) {
+            console.log("MONO_WASM: Disabling WASM SIMD support due to JIT failure", exc);
+            wasmSimdSupported = false;
+        }
+    }
+
+    return wasmSimdSupported;
+}
+
+// Builds the import name for a native function pointer of the given signature type
+//  ("<typeName>_<pointer in hex>") and defines the import on the builder if the
+//  builder doesn't already have an entry under that name.
+// Returns the import name so the caller can pass it to callImport.
+function get_import_name (
+    builder: WasmBuilder, typeName: string,
+    functionPtr: number
+) : string {
+    const importName = typeName + "_" + functionPtr.toString(16);
+    const alreadyDefined = typeof (builder.importedFunctions[importName]) === "object";
+    if (!alreadyDefined)
+        builder.defineImportedFunction("s", importName, typeName, false, functionPtr);
+
+    return importName;
+}
+
+// Size in bytes of one vector element for each MINT_SIMD_V128_*_CREATE opcode.
+// emit_simd divides sizeOfV128 by this to get the number of elements to pack.
+const simdCreateSizes = {
+    [MintOpcode.MINT_SIMD_V128_I1_CREATE]: 1,
+    [MintOpcode.MINT_SIMD_V128_I2_CREATE]: 2,
+    [MintOpcode.MINT_SIMD_V128_I4_CREATE]: 4,
+    [MintOpcode.MINT_SIMD_V128_I8_CREATE]: 8,
+};
+
+// Wasm load opcode used by emit_simd to read each source element from its stack slot
+//  for the corresponding *_CREATE opcode.
+const simdCreateLoadOps = {
+    [MintOpcode.MINT_SIMD_V128_I1_CREATE]: WasmOpcode.i32_load8_s,
+    [MintOpcode.MINT_SIMD_V128_I2_CREATE]: WasmOpcode.i32_load16_s,
+    [MintOpcode.MINT_SIMD_V128_I4_CREATE]: WasmOpcode.i32_load,
+    [MintOpcode.MINT_SIMD_V128_I8_CREATE]: WasmOpcode.i64_load,
+};
+
+// Wasm store opcode used by emit_simd to write each element into the destination vector
+//  for the corresponding *_CREATE opcode.
+const simdCreateStoreOps = {
+    [MintOpcode.MINT_SIMD_V128_I1_CREATE]: WasmOpcode.i32_store8,
+    [MintOpcode.MINT_SIMD_V128_I2_CREATE]: WasmOpcode.i32_store16,
+    [MintOpcode.MINT_SIMD_V128_I4_CREATE]: WasmOpcode.i32_store,
+    [MintOpcode.MINT_SIMD_V128_I8_CREATE]: WasmOpcode.i64_store,
+};
+
+// Emits wasm for one SIMD interpreter opcode located at ip.
+//  opcode/opname: the interpreter opcode and its display name (used for stats and logging)
+//  argCount: selects which emit_simd_N helper is attempted (2, 3 or 4)
+//  index: intrinsic id passed through to the emit_simd_N helpers and to
+//   mono_jiterp_get_simd_intrinsic
+// Returns true if code was emitted and false if the opcode is not handled here.
+function emit_simd (
+    builder: WasmBuilder, ip: MintOpcodePtr,
+    opcode: MintOpcode, opname: string,
+    argCount: number, index: number
+) : boolean {
+    // v128 opcodes are only emitted when the option is on and the environment can compile them
+    const useV128 = builder.options.enableSimd && getIsWasmSimdSupported();
+
+    // First, if compiling an intrinsic, try the special vectorized implementation
+    if (useV128) {
+        let vectorized = false;
+        if (argCount === 2)
+            vectorized = emit_simd_2(builder, ip, <SimdIntrinsic2>index);
+        else if (argCount === 3)
+            vectorized = emit_simd_3(builder, ip, <SimdIntrinsic3>index);
+        else if (argCount === 4)
+            vectorized = emit_simd_4(builder, ip, <SimdIntrinsic4>index);
+
+        if (vectorized)
+            return true;
+    }
+
+    // Otherwise use a mix of non-vectorized wasm and the interpreter's own implementations
+    switch (opcode) {
+        case MintOpcode.MINT_SIMD_V128_LDC: {
+            // The 16 constant bytes are read from the instruction stream at byte offset 4
+            const constantAddress = <any>ip + 4;
+            if (useV128) {
+                // Embed the constant directly in the trace as a v128.const
+                builder.local("pLocals");
+                builder.appendSimd(WasmSimdOpcode.v128_const);
+                builder.appendBytes(Module.HEAPU8.slice(constantAddress, constantAddress + sizeOfV128));
+                append_simd_store(builder, ip);
+            } else {
+                // dest
+                append_ldloca(builder, getArgU16(ip, 1), sizeOfV128);
+                // src
+                builder.ptr_const(constantAddress);
+                append_memmove_dest_src(builder, sizeOfV128);
+            }
+            return true;
+        }
+        case MintOpcode.MINT_SIMD_V128_I1_CREATE:
+        case MintOpcode.MINT_SIMD_V128_I2_CREATE:
+        case MintOpcode.MINT_SIMD_V128_I4_CREATE:
+        case MintOpcode.MINT_SIMD_V128_I8_CREATE: {
+            // These opcodes pack a series of locals into a vector, one element at a time
+            const laneSize = simdCreateSizes[opcode];
+            const laneCount = sizeOfV128 / laneSize;
+            const destBase = getArgU16(ip, 1);
+            const srcBase = getArgU16(ip, 2);
+            const laneLoadOp = simdCreateLoadOps[opcode];
+            const laneStoreOp = simdCreateStoreOps[opcode];
+            let lane = 0;
+            while (lane < laneCount) {
+                builder.local("pLocals");
+                // each source element lives in its own stack slot
+                append_ldloc(builder, srcBase + (lane * sizeOfStackval), laneLoadOp);
+                // destination elements are tightly packed
+                append_stloc_tail(builder, destBase + (lane * laneSize), laneStoreOp);
+                lane++;
+            }
+            return true;
+        }
+        case MintOpcode.MINT_SIMD_INTRINS_P_P:
+        case MintOpcode.MINT_SIMD_INTRINS_P_PP:
+        case MintOpcode.MINT_SIMD_INTRINS_P_PPP: {
+            // Number of source operands and the matching import signature for this opcode
+            let srcCount = 1, signatureName = "simd_p_p";
+            if (opcode === MintOpcode.MINT_SIMD_INTRINS_P_PP) {
+                srcCount = 2;
+                signatureName = "simd_p_pp";
+            } else if (opcode === MintOpcode.MINT_SIMD_INTRINS_P_PPP) {
+                srcCount = 3;
+                signatureName = "simd_p_ppp";
+            }
+
+            // Track how often each opcode ends up calling into the C implementation
+            counters.simdFallback[opname] = (counters.simdFallback[opname] || 0) + 1;
+            // res
+            append_ldloca(builder, getArgU16(ip, 1), sizeOfV128);
+            // src operands occupy consecutive argument slots starting at 2
+            for (let i = 0; i < srcCount; i++)
+                append_ldloca(builder, getArgU16(ip, 2 + i), 0);
+            const importName = get_import_name(builder, signatureName, <any>cwraps.mono_jiterp_get_simd_intrinsic(srcCount, index));
+            builder.callImport(importName);
+            return true;
+        }
+        default:
+            console.log(`MONO_WASM: jiterpreter emit_simd failed for ${opname}`);
+            return false;
+    }
+}
+
+// Emits a v128 store targeting the opcode's destination local (argument slot 1)
+function append_simd_store (builder: WasmBuilder, ip: MintOpcodePtr) {
+    const destOffset = getArgU16(ip, 1);
+    append_stloc_tail(builder, destOffset, WasmOpcode.PREFIX_simd, WasmSimdOpcode.v128_store);
+}
+
+// Pushes pLocals (presumably the address operand for the store emitted afterwards - confirm
+//  against append_stloc_tail), then loads the source operand from argument slot 2 using
+//  loadOp, or a plain v128 load if none is given
+function append_simd_2_load (builder: WasmBuilder, ip: MintOpcodePtr, loadOp?: WasmSimdOpcode) {
+    // A falsy loadOp selects v128_load; that is harmless since v128_load itself is 0
+    const effectiveLoadOp = loadOp ? loadOp : WasmSimdOpcode.v128_load;
+    builder.local("pLocals");
+    append_ldloc(builder, getArgU16(ip, 2), WasmOpcode.PREFIX_simd, effectiveLoadOp);
+}
+
+// Pushes pLocals, then loads the two source operands (argument slots 2 and 3) as v128 values
+function append_simd_3_load (builder: WasmBuilder, ip: MintOpcodePtr) {
+    builder.local("pLocals");
+    // FIXME: Can rhs be a scalar? We handle shifts separately already
+    for (let argIndex = 2; argIndex <= 3; argIndex++)
+        append_ldloc(builder, getArgU16(ip, argIndex), WasmOpcode.PREFIX_simd, WasmSimdOpcode.v128_load);
+}
+
+// Pushes pLocals, then loads the three source operands (argument slots 2, 3 and 4) as v128 values
+function append_simd_4_load (builder: WasmBuilder, ip: MintOpcodePtr) {
+    builder.local("pLocals");
+    for (let argIndex = 2; argIndex <= 4; argIndex++)
+        append_ldloc(builder, getArgU16(ip, argIndex), WasmOpcode.PREFIX_simd, WasmSimdOpcode.v128_load);
+}
+
+// Three-argument intrinsics that emit_simd_3 treats as shifts: for these the second source
+//  operand is loaded as a scalar i32 instead of as a v128.
+// NOTE(review): the signed right-shift group has no I8 entry, unlike the other two groups;
+//  confirm whether a V128_I8_RIGHT_SHIFT intrinsic exists and was left out.
+const simdShiftTable = new Set<SimdIntrinsic3>([
+    SimdIntrinsic3.V128_I1_LEFT_SHIFT,
+    SimdIntrinsic3.V128_I2_LEFT_SHIFT,
+    SimdIntrinsic3.V128_I4_LEFT_SHIFT,
+    SimdIntrinsic3.V128_I8_LEFT_SHIFT,
+
+    SimdIntrinsic3.V128_I1_RIGHT_SHIFT,
+    SimdIntrinsic3.V128_I2_RIGHT_SHIFT,
+    SimdIntrinsic3.V128_I4_RIGHT_SHIFT,
+
+    SimdIntrinsic3.V128_I1_URIGHT_SHIFT,
+    SimdIntrinsic3.V128_I2_URIGHT_SHIFT,
+    SimdIntrinsic3.V128_I4_URIGHT_SHIFT,
+    SimdIntrinsic3.V128_I8_URIGHT_SHIFT,
+]);
+
+// Emits code that stores an all-zero v128 constant to the local at the given offset
+function append_stloc_simd_zero (builder: WasmBuilder, offset: number) {
+    // A freshly allocated Uint8Array is zero-filled, so it serves as the constant's bytes
+    const zeroVector = new Uint8Array(sizeOfV128);
+    builder.local("pLocals");
+    builder.appendSimd(WasmSimdOpcode.v128_const);
+    builder.appendBytes(zeroVector);
+    append_stloc_tail(builder, offset, WasmOpcode.PREFIX_simd, WasmSimdOpcode.v128_store);
+}
+
+// Emits the vectorized form of a two-argument (dest, src) SIMD intrinsic.
+// Returns true if code was emitted, false if this intrinsic has no vectorized form here.
+function emit_simd_2 (builder: WasmBuilder, ip: MintOpcodePtr, index: SimdIntrinsic2) : boolean {
+    // Intrinsics with a nonzero opcode from the runtime become load / op / store
+    const directOpcode = <WasmSimdOpcode>cwraps.mono_jiterp_get_simd_opcode(1, index);
+    if (directOpcode) {
+        append_simd_2_load(builder, ip);
+        builder.appendSimd(directOpcode);
+        append_simd_store(builder, ip);
+        return true;
+    }
+
+    // Zero the destination vector, then write the scalar at the start of it
+    const createScalar = (scalarLoadOp: WasmOpcode, scalarStoreOp: WasmOpcode) : boolean => {
+        const destOffset = getArgU16(ip, 1);
+        builder.local("pLocals");
+        append_stloc_simd_zero(builder, destOffset);
+        append_ldloc(builder, getArgU16(ip, 2), scalarLoadOp);
+        append_stloc_tail(builder, destOffset, scalarStoreOp);
+        return true;
+    };
+
+    // Read the scalar with a splat load and store the resulting vector
+    const createSplat = (splatLoadOp: WasmSimdOpcode) : boolean => {
+        append_simd_2_load(builder, ip, splatLoadOp);
+        append_simd_store(builder, ip);
+        return true;
+    };
+
+    switch (index) {
+        case SimdIntrinsic2.V128_I1_CREATE_SCALAR:
+            return createScalar(WasmOpcode.i32_load8_s, WasmOpcode.i32_store8);
+        case SimdIntrinsic2.V128_I2_CREATE_SCALAR:
+            return createScalar(WasmOpcode.i32_load16_s, WasmOpcode.i32_store16);
+        case SimdIntrinsic2.V128_I4_CREATE_SCALAR:
+            return createScalar(WasmOpcode.i32_load, WasmOpcode.i32_store);
+        case SimdIntrinsic2.V128_I8_CREATE_SCALAR:
+            return createScalar(WasmOpcode.i64_load, WasmOpcode.i64_store);
+
+        case SimdIntrinsic2.V128_I1_CREATE:
+            return createSplat(WasmSimdOpcode.v128_load8_splat);
+        case SimdIntrinsic2.V128_I2_CREATE:
+            return createSplat(WasmSimdOpcode.v128_load16_splat);
+        case SimdIntrinsic2.V128_I4_CREATE:
+            return createSplat(WasmSimdOpcode.v128_load32_splat);
+        case SimdIntrinsic2.V128_I8_CREATE:
+            return createSplat(WasmSimdOpcode.v128_load64_splat);
+
+        default:
+            return false;
+    }
+}
+
+// Emits the vectorized form of a three-argument (dest, lhs, rhs) SIMD intrinsic.
+// Returns true if code was emitted, false if this intrinsic has no vectorized form here.
+function emit_simd_3 (builder: WasmBuilder, ip: MintOpcodePtr, index: SimdIntrinsic3) : boolean {
+    const simple = <WasmSimdOpcode>cwraps.mono_jiterp_get_simd_opcode(2, index);
+    if (simple) {
+        if (simdShiftTable.has(index)) {
+            // Shifts take a vector lhs and a scalar i32 shift amount
+            builder.local("pLocals");
+            append_ldloc(builder, getArgU16(ip, 2), WasmOpcode.PREFIX_simd, WasmSimdOpcode.v128_load);
+            append_ldloc(builder, getArgU16(ip, 3), WasmOpcode.i32_load);
+        } else {
+            // Everything else takes two vector operands
+            append_simd_3_load(builder, ip);
+        }
+        // Both shapes finish the same way: apply the opcode and store the v128 result
+        builder.appendSimd(simple);
+        append_simd_store(builder, ip);
+        return true;
+    }
+
+    switch (index) {
+        case SimdIntrinsic3.V128_BITWISE_EQUALITY:
+        case SimdIntrinsic3.V128_BITWISE_INEQUALITY:
+            append_simd_3_load(builder, ip);
+            // Compare both 64-bit lanes, then reduce to one i32 that is nonzero only if
+            //  every lane compared equal
+            // FIXME: i64x2_ne and i64x2_any_true?
+            builder.appendSimd(WasmSimdOpcode.i64x2_eq);
+            builder.appendSimd(WasmSimdOpcode.i64x2_all_true);
+            // Inequality is the logical negation of the equality result
+            if (index === SimdIntrinsic3.V128_BITWISE_INEQUALITY)
+                builder.appendU8(WasmOpcode.i32_eqz);
+            // The result is a scalar, so it is stored as an i32 rather than a v128
+            append_stloc_tail(builder, getArgU16(ip, 1), WasmOpcode.i32_store);
+            return true;
+        default:
+            return false;
+    }
+}
+
+// Emits the vectorized form of a four-argument (dest, a, b, c) SIMD intrinsic.
+// Returns true if code was emitted, false if this intrinsic has no vectorized form here.
+function emit_simd_4 (builder: WasmBuilder, ip: MintOpcodePtr, index: SimdIntrinsic4) : boolean {
+    const directOpcode = <WasmSimdOpcode>cwraps.mono_jiterp_get_simd_opcode(3, index);
+    if (directOpcode) {
+        append_simd_4_load(builder, ip);
+        builder.appendSimd(directOpcode);
+        append_simd_store(builder, ip);
+        return true;
+    }
+
+    // Conditional select is the only intrinsic given a hand-written sequence here
+    if (index !== SimdIntrinsic4.V128_CONDITIONAL_SELECT)
+        return false;
+
+    builder.local("pLocals");
+    // Wasm spec: result = ior(iand(i1, i3), iand(i2, inot(i3)))
+    // Our opcode: *arg0 = (*arg2 & *arg1) | (*arg3 & ~*arg1)
+    // So the operands are pushed in the order arg2, arg3, arg1 (the mask goes last)
+    append_ldloc(builder, getArgU16(ip, 3), WasmOpcode.PREFIX_simd, WasmSimdOpcode.v128_load);
+    append_ldloc(builder, getArgU16(ip, 4), WasmOpcode.PREFIX_simd, WasmSimdOpcode.v128_load);
+    append_ldloc(builder, getArgU16(ip, 2), WasmOpcode.PREFIX_simd, WasmSimdOpcode.v128_load);
+    builder.appendSimd(WasmSimdOpcode.v128_bitselect);
+    append_simd_store(builder, ip);
+    return true;
+}
+
 function append_safepoint (builder: WasmBuilder, ip: MintOpcodePtr) {
     // Check whether a safepoint is required
     builder.ptr_const(cwraps.mono_jiterp_get_polling_required_address());
index 1d35527..7c2e66b 100644 (file)
@@ -12,7 +12,8 @@ import { MintOpcode, OpcodeInfo } from "./mintops";
 import cwraps from "./cwraps";
 import {
     MintOpcodePtr, WasmValtype, WasmBuilder, addWasmFunctionPointer,
-    _now, elapsedTimes, counters, getRawCwrap, importDef,
+    _now, elapsedTimes,
+    counters, getRawCwrap, importDef,
     JiterpreterOptions, getOptions, recordFailure,
     JiterpMember, getMemberOffset,
     BailoutReasonNames, BailoutReason
@@ -138,6 +139,8 @@ export const traceInfo: { [key: string]: TraceInfo } = {};
 export const
     sizeOfDataItem = 4,
     sizeOfObjectHeader = 8,
+    sizeOfV128 = 16,
+    sizeOfStackval = 8,
     // While stats are enabled, dump concise stats every N traces so that it's clear a long-running
     //  task isn't frozen if it's jitting lots of traces
     autoDumpInterval = 500;
@@ -261,7 +264,7 @@ function getTraceImports() {
 
     traceImports = [
         importDef("bailout", recordBailout),
-        importDef("copy_pointer", getRawCwrap("mono_wasm_copy_managed_pointer")),
+        importDef("copy_ptr", getRawCwrap("mono_wasm_copy_managed_pointer")),
         importDef("entry", getRawCwrap("mono_jiterp_increase_entry_count")),
         importDef("value_copy", getRawCwrap("mono_jiterp_value_copy")),
         importDef("gettype", getRawCwrap("mono_jiterp_gettype_ref")),
@@ -376,8 +379,7 @@ function initialize_builder(builder: WasmBuilder) {
         WasmValtype.i32, true
     );
     builder.defineType(
-        "copy_pointer",
-        {
+        "copy_ptr", {
             "dest": WasmValtype.i32,
             "src": WasmValtype.i32
         },
@@ -693,13 +695,34 @@ function initialize_builder(builder: WasmBuilder) {
         },
         WasmValtype.i32, true
     );
+    builder.defineType(
+        "simd_p_p", {
+            "arg0": WasmValtype.i32,
+            "arg1": WasmValtype.i32,
+        }, WasmValtype.void, true
+    );
+    builder.defineType(
+        "simd_p_pp", {
+            "arg0": WasmValtype.i32,
+            "arg1": WasmValtype.i32,
+            "arg2": WasmValtype.i32,
+        }, WasmValtype.void, true
+    );
+    builder.defineType(
+        "simd_p_ppp", {
+            "arg0": WasmValtype.i32,
+            "arg1": WasmValtype.i32,
+            "arg2": WasmValtype.i32,
+            "arg3": WasmValtype.i32,
+        }, WasmValtype.void, true
+    );
 
     const traceImports = getTraceImports();
 
     // Pre-define function imports as persistent
     for (let i = 0; i < traceImports.length; i++) {
         mono_assert(traceImports[i], () => `trace #${i} missing`);
-        builder.defineImportedFunction("i", traceImports[i][0], traceImports[i][1], false, true, traceImports[i][2]);
+        builder.defineImportedFunction("i", traceImports[i][0], traceImports[i][1], true, traceImports[i][2]);
     }
 }
 
@@ -836,17 +859,15 @@ function generate_wasm(
         if (trace > 0)
             console.log(`${(<any>(builder.base)).toString(16)} ${methodFullName || traceName} generated ${buffer.length} byte(s) of wasm`);
         counters.bytesGenerated += buffer.length;
+
         if (buffer.length >= maxModuleSize) {
             console.warn(`MONO_WASM: Jiterpreter generated too much code (${buffer.length} bytes) for trace ${traceName}. Please report this issue.`);
             return 0;
         }
-        const traceModule = new WebAssembly.Module(buffer);
 
-        const traceInstance = new WebAssembly.Instance(traceModule, {
-            i: builder.getImportedFunctionTable(),
-            c: <any>builder.getConstants(),
-            m: { h: (<any>Module).asm.memory },
-        });
+        const traceModule = new WebAssembly.Module(buffer);
+        const wasmImports = builder.getWasmImports();
+        const traceInstance = new WebAssembly.Instance(traceModule, wasmImports);
 
         // Get the exported trace function
         const fn = traceInstance.exports[traceName];
@@ -907,7 +928,7 @@ function generate_wasm(
                     console.log(builder.traceBuf[i]);
             }
 
-            console.log(`// MONO_WASM: ${methodFullName || methodName}:${traceOffset.toString(16)} generated, blob follows //`);
+            console.log(`// MONO_WASM: ${methodFullName || traceName} generated, blob follows //`);
             let s = "", j = 0;
             try {
                 // We may have thrown an uncaught exception while inside a block,
@@ -1194,7 +1215,10 @@ export function jiterpreter_dump_stats(b?: boolean, concise?: boolean) {
             console.log(`// ${keys[i]}: ${abortCounts[keys[i]]} abort(s)`);
     }
 
-    if ((typeof (globalThis.setTimeout) === "function") && (b !== undefined))
+    for (const k in counters.simdFallback)
+        console.log(`// simd ${k}: ${counters.simdFallback[k]} fallback insn(s)`);
+
+    if ((typeof(globalThis.setTimeout) === "function") && (b !== undefined))
         setTimeout(
             () => jiterpreter_dump_stats(b),
             15000
diff --git a/src/mono/wasm/runtime/wasm-simd-feature-detect.wasm b/src/mono/wasm/runtime/wasm-simd-feature-detect.wasm
new file mode 100644 (file)
index 0000000..5d7c49d
Binary files /dev/null and b/src/mono/wasm/runtime/wasm-simd-feature-detect.wasm differ
diff --git a/src/mono/wasm/runtime/wasm-simd-feature-detect.wat b/src/mono/wasm/runtime/wasm-simd-feature-detect.wat
new file mode 100644 (file)
index 0000000..8cd56ad
--- /dev/null
@@ -0,0 +1,6 @@
+(module
+  ;; SIMD feature-detection probe: a single exported function whose body uses
+  ;;  i32x4.splat, so the module only compiles where that instruction is accepted.
+  (func $test (export "test")
+    (i32x4.splat (i32.const 0))
+    ;; discard the vector; the function declares no results
+    drop
+  )
+)
index cb76cb0..5bb5fb3 100644 (file)
@@ -25,7 +25,7 @@
   <PropertyGroup>
     <ICULibDir Condition="'$(MonoWasmThreads)' != 'true'">$([MSBuild]::NormalizeDirectory('$(PkgMicrosoft_NETCore_Runtime_ICU_Transport)', 'runtimes', 'browser-wasm', 'native', 'lib'))</ICULibDir>
     <ICULibDir Condition="'$(MonoWasmThreads)' == 'true'">$([MSBuild]::NormalizeDirectory('$(PkgMicrosoft_NETCore_Runtime_ICU_Transport)', 'runtimes', 'browser-wasm-threads', 'native', 'lib'))</ICULibDir>
-    <WasmEnableSIMD Condition="'$(WasmEnableSIMD)' == ''">false</WasmEnableSIMD>
+    <WasmEnableSIMD Condition="'$(WasmEnableSIMD)' == ''">true</WasmEnableSIMD>
     <WasmEnableLegacyJsInterop Condition="'$(WasmEnableLegacyJsInterop)' == ''">true</WasmEnableLegacyJsInterop>
     <FilterSystemTimeZones Condition="'$(FilterSystemTimeZones)' == ''">false</FilterSystemTimeZones>
     <EmccCmd>emcc</EmccCmd>
     <ItemGroup>
       <_EmccLinkFlags Include="-s INITIAL_MEMORY=$(EmccInitialHeapSize)" />
       <_EmccLinkFlags Include="-s STACK_SIZE=$(EmccStackSize)" />
-      <_EmccCommonFlags Condition="'$(WasmEnableSIMD)' == 'true'" Include="-msimd128" />
+      <_EmccCommonFlags Include="-msimd128" />
       <_EmccCommonFlags Condition="'$(MonoWasmThreads)' == 'true'" Include="-s USE_PTHREADS=1" />
       <_EmccLinkFlags Condition="'$(MonoWasmThreads)' == 'true'" Include="-Wno-pthreads-mem-growth" />
       <_EmccLinkFlags Condition="'$(MonoWasmThreads)' == 'true'" Include="-s PTHREAD_POOL_SIZE=0" />
   </Target>
 
   <Target Name="GenerateMintopsTS"
-          Inputs="$(MonoProjectRoot)wasm\runtime\genmintops.py;$(MonoProjectRoot)mono\mini\interp\mintops.def"
+          Inputs="$(MonoProjectRoot)wasm\runtime\genmintops.py;$(MonoProjectRoot)mono\mini\interp\mintops.def;$(MonoProjectRoot)mono\mini\interp\interp-simd-intrins.def"
           Outputs="$(NativeGeneratedFilesDir)mintops.ts">
-      <Exec Command="$(PythonCmd) $(MonoProjectRoot)wasm/runtime/genmintops.py $(MonoProjectRoot)mono/mini/interp/mintops.def $(NativeGeneratedFilesDir)mintops.ts" />
+      <Exec Command="$(PythonCmd) $(MonoProjectRoot)wasm/runtime/genmintops.py $(MonoProjectRoot)mono/mini/interp/mintops.def $(MonoProjectRoot)mono/mini/interp/interp-simd-intrins.def $(NativeGeneratedFilesDir)mintops.ts" />
   </Target>
 
   <Target Name="BuildWithRollup"