From: Thomas Lively
Date: Wed, 29 Jul 2020 00:46:45 +0000 (-0700)
Subject: [WebAssembly] Implement truncating vector stores
X-Git-Tag: llvmorg-13-init~16497
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=ffd8c23ccb74bbd19ba16e3706a0c5e1e472cca6;p=platform%2Fupstream%2Fllvm.git

[WebAssembly] Implement truncating vector stores

Rather than expanding truncating stores so that vectors are stored one
lane at a time, lower them to a sequence of instructions using narrowing
operations instead, when possible. Since the narrowing operations have
saturating semantics, but truncating stores require truncation, mask the
stored value to manually truncate it before narrowing. Also, since
narrowing is a binary operation, pass in the original vector as the
unused second argument.

Differential Revision: https://reviews.llvm.org/D84377
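To see why the mask is needed before the narrow, consider a single i16 lane
being truncated to i8. The snippet below is a minimal scalar sketch
(illustrative only, not part of the patch); narrow_u models the per-lane
behaviour of i8x16.narrow_i16x8_u:

#include <cassert>
#include <cstdint>

// Per-lane model of i8x16.narrow_i16x8_u: unsigned saturation, not wrapping.
static uint8_t narrow_u(uint16_t x) { return x > 0xff ? 0xff : uint8_t(x); }

int main() {
  uint16_t lane = 0x1234;
  // A truncating store must keep only the low byte.
  assert(uint8_t(lane) == 0x34);
  // Narrowing alone saturates, which would store the wrong byte...
  assert(narrow_u(lane) == 0xff);
  // ...but masking the lane first makes saturation impossible, so the
  // narrow produces exactly the truncated value.
  assert(narrow_u(lane & 0x00ff) == 0x34);
  return 0;
}

The patterns below apply the same idea lane-wise, ANDing with a splatted
0x00ff00ff (for v8i16 -> v8i8) or 0x0000ffff (for v4i32 -> v4i16) before the
narrowing instruction.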
---

diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index c6519fa..d6197e9 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -246,6 +246,9 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering(
       setLoadExtAction(Ext, MVT::v4i32, MVT::v4i16, Legal);
       setLoadExtAction(Ext, MVT::v2i64, MVT::v2i32, Legal);
     }
+    // And some truncating stores are legal as well
+    setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
+    setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
   }
 
   // Don't do anything clever with build_pairs
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
index b603701..16bfc81 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
@@ -885,6 +885,12 @@ defm "" : SIMDConvert;
 defm "" : SIMDConvert;
 defm "" : SIMDConvert;
+// Lower llvm.wasm.trunc.saturate.* to saturating instructions
+def : Pat<(v4i32 (int_wasm_trunc_saturate_signed (v4f32 V128:$src))),
+          (fp_to_sint_v4i32_v4f32 (v4f32 V128:$src))>;
+def : Pat<(v4i32 (int_wasm_trunc_saturate_unsigned (v4f32 V128:$src))),
+          (fp_to_uint_v4i32_v4f32 (v4f32 V128:$src))>;
+
 // Widening operations
 multiclass SIMDWiden baseInst> {
@@ -921,11 +927,95 @@ multiclass SIMDNarrow;
 defm "" : SIMDNarrow;
 
-// Lower llvm.wasm.trunc.saturate.* to saturating instructions
-def : Pat<(v4i32 (int_wasm_trunc_saturate_signed (v4f32 V128:$src))),
-          (fp_to_sint_v4i32_v4f32 (v4f32 V128:$src))>;
-def : Pat<(v4i32 (int_wasm_trunc_saturate_unsigned (v4f32 V128:$src))),
-          (fp_to_uint_v4i32_v4f32 (v4f32 V128:$src))>;
+// Use narrowing operations for truncating stores. Since the narrowing
+// operations are saturating instead of truncating, we need to mask
+// the stored values first.
+// TODO: Use consts instead of splats
+def store_v8i8_trunc_v8i16 :
+  OutPatFrag<(ops node:$val),
+             (EXTRACT_LANE_v2i64
+               (NARROW_U_v16i8
+                 (AND_v4i32 (SPLAT_v4i32 (CONST_I32 0x00ff00ff)), node:$val),
+                 node:$val // Unused input
+               ),
+               0
+             )>;
+
+def store_v4i16_trunc_v4i32 :
+  OutPatFrag<(ops node:$val),
+             (EXTRACT_LANE_v2i64
+               (NARROW_U_v8i16
+                 (AND_v4i32 (SPLAT_v4i32 (CONST_I32 0x0000ffff)), node:$val),
+                 node:$val // Unused input
+               ),
+               0
+             )>;
+
+// Store patterns adapted from WebAssemblyInstrMemory.td
+multiclass NarrowingStorePatNoOffset {
+  def : Pat<(node ty:$val, I32:$addr),
+            (STORE_I64_A32 0, 0, I32:$addr, (i64 (out ty:$val)))>,
+        Requires<[HasAddr32]>;
+  def : Pat<(node ty:$val, I64:$addr),
+            (STORE_I64_A64 0, 0, I64:$addr, (i64 (out ty:$val)))>,
+        Requires<[HasAddr64]>;
+}
+
+defm : NarrowingStorePatNoOffset;
+defm : NarrowingStorePatNoOffset;
+
+multiclass NarrowingStorePatImmOff {
+  def : Pat<(kind ty:$val, (operand I32:$addr, imm:$off)),
+            (STORE_I64_A32 0, imm:$off, I32:$addr, (i64 (out ty:$val)))>,
+        Requires<[HasAddr32]>;
+  def : Pat<(kind ty:$val, (operand I64:$addr, imm:$off)),
+            (STORE_I64_A64 0, imm:$off, I64:$addr, (i64 (out ty:$val)))>,
+        Requires<[HasAddr64]>;
+}
+
+defm : NarrowingStorePatImmOff;
+defm : NarrowingStorePatImmOff;
+defm : NarrowingStorePatImmOff;
+defm : NarrowingStorePatImmOff;
+
+multiclass NarrowingStorePatOffsetOnly {
+  def : Pat<(kind ty:$val, imm:$off),
+            (STORE_I64_A32 0, imm:$off, (CONST_I32 0), (i64 (out ty:$val)))>,
+        Requires<[HasAddr32]>;
+  def : Pat<(kind ty:$val, imm:$off),
+            (STORE_I64_A64 0, imm:$off, (CONST_I64 0), (i64 (out ty:$val)))>,
+        Requires<[HasAddr64]>;
+}
+
+defm : NarrowingStorePatOffsetOnly;
+defm : NarrowingStorePatOffsetOnly;
+
+multiclass NarrowingStorePatGlobalAddrOffOnly {
+  def : Pat<(kind ty:$val, (WebAssemblywrapper tglobaladdr:$off)),
+            (STORE_I64_A32
+              0, tglobaladdr:$off, (CONST_I32 0), (i64 (out ty:$val)))>,
+        Requires<[IsNotPIC, HasAddr32]>;
+  def : Pat<(kind ty:$val, (WebAssemblywrapper tglobaladdr:$off)),
+            (STORE_I64_A64
+              0, tglobaladdr:$off, (CONST_I64 0), (i64 (out ty:$val)))>,
+        Requires<[IsNotPIC, HasAddr64]>;
+}
+
+defm : NarrowingStorePatGlobalAddrOffOnly;
+defm : NarrowingStorePatGlobalAddrOffOnly;
 
 // Bitcasts are nops
 // Matching bitcast t1 to t1 causes strange errors, so avoid repeating types
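Taken together, the OutPatFrags above select a four-instruction sequence for
each truncating vector store: mask the wide lanes, narrow them with the
unsigned saturating instruction (reusing the stored vector as the ignored
second operand), extract the low 64 bits, and do an ordinary i64.store. An
element-wise C++ model of the v8i16 -> v8i8 case is sketched below; the helper
name truncStoreV8I16 is hypothetical and only illustrates the semantics, it is
not part of the patch:

#include <cstdint>
#include <cstring>

// Element-wise model of the v8i16 -> v8i8 truncating-store lowering:
// AND with a splatted 0x00ff00ff, i8x16.narrow_i16x8_u, i64x2.extract_lane 0,
// then a plain 64-bit store of the packed bytes.
void truncStoreV8I16(const uint16_t val[8], uint8_t *mem) {
  uint8_t packed[8];
  for (int i = 0; i < 8; ++i) {
    uint16_t masked = val[i] & 0x00ff;                   // mask step
    packed[i] = masked > 0xff ? 0xff : uint8_t(masked);  // narrow_u; never saturates here
  }
  std::memcpy(mem, packed, 8);  // extract lane 0 as i64 + i64.store
}

The second narrow operand only populates the upper eight bytes of the 128-bit
result, which the i64x2.extract_lane 0 discards; that is why the original
vector can be passed in unchanged as the unused input.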
diff --git a/llvm/test/CodeGen/WebAssembly/simd-offset.ll b/llvm/test/CodeGen/WebAssembly/simd-offset.ll
index 8d39ddc..b2d3293 100644
--- a/llvm/test/CodeGen/WebAssembly/simd-offset.ll
+++ b/llvm/test/CodeGen/WebAssembly/simd-offset.ll
@@ -918,6 +918,24 @@ define void @store_v8i16(<8 x i16> %v, <8 x i16>* %p) {
   ret void
 }
 
+define void @store_narrowing_v8i16(<8 x i8> %v, <8 x i8>* %p) {
+; CHECK-LABEL: store_narrowing_v8i16:
+; CHECK:         .functype store_narrowing_v8i16 (v128, i32) -> ()
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i32.const 16711935
+; CHECK-NEXT:    i32x4.splat
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v128.and
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i8x16.narrow_i16x8_u
+; CHECK-NEXT:    i64x2.extract_lane 0
+; CHECK-NEXT:    i64.store 0
+; CHECK-NEXT:    # fallthrough-return
+  store <8 x i8> %v, <8 x i8>* %p
+  ret void
+}
+
 define void @store_v8i16_with_folded_offset(<8 x i16> %v, <8 x i16>* %p) {
 ; CHECK-LABEL: store_v8i16_with_folded_offset:
 ; CHECK:         .functype store_v8i16_with_folded_offset (v128, i32) -> ()
@@ -933,6 +951,27 @@ define void @store_v8i16_with_folded_offset(<8 x i16> %v, <8 x i16>* %p) {
   ret void
 }
 
+define void @store_narrowing_v8i16_with_folded_offset(<8 x i8> %v, <8 x i8>* %p) {
+; CHECK-LABEL: store_narrowing_v8i16_with_folded_offset:
+; CHECK:         .functype store_narrowing_v8i16_with_folded_offset (v128, i32) -> ()
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i32.const 16711935
+; CHECK-NEXT:    i32x4.splat
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v128.and
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i8x16.narrow_i16x8_u
+; CHECK-NEXT:    i64x2.extract_lane 0
+; CHECK-NEXT:    i64.store 16
+; CHECK-NEXT:    # fallthrough-return
+  %q = ptrtoint <8 x i8>* %p to i32
+  %r = add nuw i32 %q, 16
+  %s = inttoptr i32 %r to <8 x i8>*
+  store <8 x i8> %v , <8 x i8>* %s
+  ret void
+}
+
 define void @store_v8i16_with_folded_gep_offset(<8 x i16> %v, <8 x i16>* %p) {
 ; CHECK-LABEL: store_v8i16_with_folded_gep_offset:
 ; CHECK:         .functype store_v8i16_with_folded_gep_offset (v128, i32) -> ()
@@ -946,6 +985,25 @@ define void @store_v8i16_with_folded_gep_offset(<8 x i16> %v, <8 x i16>* %p) {
   ret void
 }
 
+define void @store_narrowing_v8i16_with_folded_gep_offset(<8 x i8> %v, <8 x i8>* %p) {
+; CHECK-LABEL: store_narrowing_v8i16_with_folded_gep_offset:
+; CHECK:         .functype store_narrowing_v8i16_with_folded_gep_offset (v128, i32) -> ()
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i32.const 16711935
+; CHECK-NEXT:    i32x4.splat
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v128.and
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i8x16.narrow_i16x8_u
+; CHECK-NEXT:    i64x2.extract_lane 0
+; CHECK-NEXT:    i64.store 8
+; CHECK-NEXT:    # fallthrough-return
+  %s = getelementptr inbounds <8 x i8>, <8 x i8>* %p, i32 1
+  store <8 x i8> %v , <8 x i8>* %s
+  ret void
+}
+
 define void @store_v8i16_with_unfolded_gep_negative_offset(<8 x i16> %v, <8 x i16>* %p) {
 ; CHECK-LABEL: store_v8i16_with_unfolded_gep_negative_offset:
 ; CHECK:         .functype store_v8i16_with_unfolded_gep_negative_offset (v128, i32) -> ()
@@ -961,6 +1019,27 @@ define void @store_v8i16_with_unfolded_gep_negative_offset(<8 x i16> %v, <8 x i1
   ret void
 }
 
+define void @store_narrowing_v8i16_with_unfolded_gep_negative_offset(<8 x i8> %v, <8 x i8>* %p) {
+; CHECK-LABEL: store_narrowing_v8i16_with_unfolded_gep_negative_offset:
+; CHECK:         .functype store_narrowing_v8i16_with_unfolded_gep_negative_offset (v128, i32) -> ()
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i32.const -8
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    i32.const 16711935
+; CHECK-NEXT:    i32x4.splat
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v128.and
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i8x16.narrow_i16x8_u
+; CHECK-NEXT:    i64x2.extract_lane 0
+; CHECK-NEXT:    i64.store 0
+; CHECK-NEXT:    # fallthrough-return
+  %s = getelementptr inbounds <8 x i8>, <8 x i8>* %p, i32 -1
+  store <8 x i8> %v , <8 x i8>* %s
+  ret void
+}
+
 define void @store_v8i16_with_unfolded_offset(<8 x i16> %v, <8 x i16>* %p) {
 ; CHECK-LABEL: store_v8i16_with_unfolded_offset:
 ; CHECK:         .functype store_v8i16_with_unfolded_offset (v128, i32) -> ()
@@ -978,6 +1057,29 @@ define void @store_v8i16_with_unfolded_offset(<8 x i16> %v, <8 x i16>* %p) {
   ret void
 }
 
+define void @store_narrowing_v8i16_with_unfolded_offset(<8 x i8> %v, <8 x i8>* %p) {
+; CHECK-LABEL: store_narrowing_v8i16_with_unfolded_offset:
+; CHECK:         .functype store_narrowing_v8i16_with_unfolded_offset (v128, i32) -> ()
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i32.const 16
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    i32.const 16711935
+; CHECK-NEXT:    i32x4.splat
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v128.and
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i8x16.narrow_i16x8_u
+; CHECK-NEXT:    i64x2.extract_lane 0
+; CHECK-NEXT:    i64.store 0
+; CHECK-NEXT:    # fallthrough-return
+  %q = ptrtoint <8 x i8>* %p to i32
+  %r = add nsw i32 %q, 16
+  %s = inttoptr i32 %r to <8 x i8>*
+  store <8 x i8> %v , <8 x i8>* %s
+  ret void
+}
+
 define void @store_v8i16_with_unfolded_gep_offset(<8 x i16> %v, <8 x i16>* %p) {
 ; CHECK-LABEL: store_v8i16_with_unfolded_gep_offset:
 ; CHECK:         .functype store_v8i16_with_unfolded_gep_offset (v128, i32) -> ()
@@ -993,6 +1095,27 @@ define void @store_v8i16_with_unfolded_gep_offset(<8 x i16> %v, <8 x i16>* %p) {
   ret void
 }
 
+define void @store_narrowing_v8i16_with_unfolded_gep_offset(<8 x i8> %v, <8 x i8>* %p) {
+; CHECK-LABEL: store_narrowing_v8i16_with_unfolded_gep_offset:
+; CHECK:         .functype store_narrowing_v8i16_with_unfolded_gep_offset (v128, i32) -> ()
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i32.const 8
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    i32.const 16711935
+; CHECK-NEXT:    i32x4.splat
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v128.and
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i8x16.narrow_i16x8_u
+; CHECK-NEXT:    i64x2.extract_lane 0
+; CHECK-NEXT:    i64.store 0
+; CHECK-NEXT:    # fallthrough-return
+  %s = getelementptr <8 x i8>, <8 x i8>* %p, i32 1
+  store <8 x i8> %v , <8 x i8>* %s
+  ret void
+}
+
 define void @store_v8i16_to_numeric_address(<8 x i16> %v) {
 ; CHECK-LABEL: store_v8i16_to_numeric_address:
 ; CHECK:         .functype store_v8i16_to_numeric_address (v128) -> ()
@@ -1006,6 +1129,25 @@ define void @store_v8i16_to_numeric_address(<8 x i16> %v) {
   ret void
 }
 
+define void @store_narrowing_v8i16_to_numeric_address(<8 x i8> %v, <8 x i8>* %p) {
+; CHECK-LABEL: store_narrowing_v8i16_to_numeric_address:
+; CHECK:         .functype store_narrowing_v8i16_to_numeric_address (v128, i32) -> ()
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    i32.const 0
+; CHECK-NEXT:    i32.const 16711935
+; CHECK-NEXT:    i32x4.splat
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v128.and
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i8x16.narrow_i16x8_u
+; CHECK-NEXT:    i64x2.extract_lane 0
+; CHECK-NEXT:    i64.store 32
+; CHECK-NEXT:    # fallthrough-return
+  %s = inttoptr i32 32 to <8 x i8>*
+  store <8 x i8> %v , <8 x i8>* %s
+  ret void
+}
+
 define void @store_v8i16_to_global_address(<8 x i16> %v) {
 ; CHECK-LABEL: store_v8i16_to_global_address:
 ; CHECK:         .functype store_v8i16_to_global_address (v128) -> ()
@@ -1018,6 +1160,24 @@ define void @store_v8i16_to_global_address(<8 x i16> %v) {
   ret void
 }
 
+define void @store_narrowing_v8i16_to_global_address(<8 x i8> %v) {
+; CHECK-LABEL: store_narrowing_v8i16_to_global_address:
+; CHECK:         .functype store_narrowing_v8i16_to_global_address (v128) -> ()
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    i32.const 0
+; CHECK-NEXT:    i32.const 16711935
+; CHECK-NEXT:    i32x4.splat
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v128.and
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i8x16.narrow_i16x8_u
+; CHECK-NEXT:    i64x2.extract_lane 0
+; CHECK-NEXT:    i64.store gv_v8i8
+; CHECK-NEXT:    # fallthrough-return
+  store <8 x i8> %v , <8 x i8>* @gv_v8i8
+  ret void
+}
+
 ; ==============================================================================
 ; 4 x i32
 ; ==============================================================================
@@ -1588,6 +1748,24 @@ define void @store_v4i32(<4 x i32> %v, <4 x i32>* %p) {
   ret void
 }
 
+define void @store_narrowing_v4i32(<4 x i16> %v, <4 x i16>* %p) {
+; CHECK-LABEL: store_narrowing_v4i32:
+; CHECK:         .functype store_narrowing_v4i32 (v128, i32) -> ()
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i32.const 65535
+; CHECK-NEXT:    i32x4.splat
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v128.and
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i16x8.narrow_i32x4_u
+; CHECK-NEXT:    i64x2.extract_lane 0
+; CHECK-NEXT:    i64.store 0
+; CHECK-NEXT:    # fallthrough-return
+  store <4 x i16> %v , <4 x i16>* %p
+  ret void
+}
+
 define void @store_v4i32_with_folded_offset(<4 x i32> %v, <4 x i32>* %p) {
 ; CHECK-LABEL: store_v4i32_with_folded_offset:
 ; CHECK:         .functype store_v4i32_with_folded_offset (v128, i32) -> ()
@@ -1603,6 +1781,27 @@ define void @store_v4i32_with_folded_offset(<4 x i32> %v, <4 x i32>* %p) {
   ret void
 }
 
+define void @store_narrowing_v4i32_with_folded_offset(<4 x i16> %v, <4 x i16>* %p) {
+; CHECK-LABEL: store_narrowing_v4i32_with_folded_offset:
+; CHECK:         .functype store_narrowing_v4i32_with_folded_offset (v128, i32) -> ()
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i32.const 65535
+; CHECK-NEXT:    i32x4.splat
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v128.and
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i16x8.narrow_i32x4_u
+; CHECK-NEXT:    i64x2.extract_lane 0
+; CHECK-NEXT:    i64.store 16
+; CHECK-NEXT:    # fallthrough-return
+  %q = ptrtoint <4 x i16>* %p to i32
+  %r = add nuw i32 %q, 16
+  %s = inttoptr i32 %r to <4 x i16>*
+  store <4 x i16> %v , <4 x i16>* %s
+  ret void
+}
+
 define void @store_v4i32_with_folded_gep_offset(<4 x i32> %v, <4 x i32>* %p) {
 ; CHECK-LABEL: store_v4i32_with_folded_gep_offset:
 ; CHECK:         .functype store_v4i32_with_folded_gep_offset (v128, i32) -> ()
@@ -1616,6 +1815,25 @@ define void @store_v4i32_with_folded_gep_offset(<4 x i32> %v, <4 x i32>* %p) {
   ret void
 }
 
+define void @store_narrowing_v4i32_with_folded_gep_offset(<4 x i16> %v, <4 x i16>* %p) {
+; CHECK-LABEL: store_narrowing_v4i32_with_folded_gep_offset:
+; CHECK:         .functype store_narrowing_v4i32_with_folded_gep_offset (v128, i32) -> ()
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i32.const 65535
+; CHECK-NEXT:    i32x4.splat
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v128.and
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i16x8.narrow_i32x4_u
+; CHECK-NEXT:    i64x2.extract_lane 0
+; CHECK-NEXT:    i64.store 8
+; CHECK-NEXT:    # fallthrough-return
+  %s = getelementptr inbounds <4 x i16>, <4 x i16>* %p, i32 1
+  store <4 x i16> %v , <4 x i16>* %s
+  ret void
+}
+
 define void @store_v4i32_with_unfolded_gep_negative_offset(<4 x i32> %v, <4 x i32>* %p) {
 ; CHECK-LABEL: store_v4i32_with_unfolded_gep_negative_offset:
 ; CHECK:         .functype store_v4i32_with_unfolded_gep_negative_offset (v128, i32) -> ()
@@ -1631,6 +1849,27 @@ define void @store_v4i32_with_unfolded_gep_negative_offset(<4 x i32> %v, <4 x i3
   ret void
 }
 
+define void @store_narrowing_v4i32_with_unfolded_gep_negative_offset(<4 x i16> %v, <4 x i16>* %p) {
+; CHECK-LABEL: store_narrowing_v4i32_with_unfolded_gep_negative_offset:
+; CHECK:         .functype store_narrowing_v4i32_with_unfolded_gep_negative_offset (v128, i32) -> ()
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i32.const -8
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    i32.const 65535
+; CHECK-NEXT:    i32x4.splat
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v128.and
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i16x8.narrow_i32x4_u
+; CHECK-NEXT:    i64x2.extract_lane 0
+; CHECK-NEXT:    i64.store 0
+; CHECK-NEXT:    # fallthrough-return
+  %s = getelementptr inbounds <4 x i16>, <4 x i16>* %p, i32 -1
+  store <4 x i16> %v , <4 x i16>* %s
+  ret void
+}
+
 define void @store_v4i32_with_unfolded_offset(<4 x i32> %v, <4 x i32>* %p) {
 ; CHECK-LABEL: store_v4i32_with_unfolded_offset:
 ; CHECK:         .functype store_v4i32_with_unfolded_offset (v128, i32) -> ()
@@ -1648,6 +1887,29 @@ define void @store_v4i32_with_unfolded_offset(<4 x i32> %v, <4 x i32>* %p) {
   ret void
 }
 
+define void @store_narrowing_v4i32_with_unfolded_offset(<4 x i16> %v, <4 x i16>* %p) {
+; CHECK-LABEL: store_narrowing_v4i32_with_unfolded_offset:
+; CHECK:         .functype store_narrowing_v4i32_with_unfolded_offset (v128, i32) -> ()
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i32.const 16
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    i32.const 65535
+; CHECK-NEXT:    i32x4.splat
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v128.and
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i16x8.narrow_i32x4_u
+; CHECK-NEXT:    i64x2.extract_lane 0
+; CHECK-NEXT:    i64.store 0
+; CHECK-NEXT:    # fallthrough-return
+  %q = ptrtoint <4 x i16>* %p to i32
+  %r = add nsw i32 %q, 16
+  %s = inttoptr i32 %r to <4 x i16>*
+  store <4 x i16> %v , <4 x i16>* %s
+  ret void
+}
+
 define void @store_v4i32_with_unfolded_gep_offset(<4 x i32> %v, <4 x i32>* %p) {
 ; CHECK-LABEL: store_v4i32_with_unfolded_gep_offset:
 ; CHECK:         .functype store_v4i32_with_unfolded_gep_offset (v128, i32) -> ()
@@ -1663,6 +1925,27 @@ define void @store_v4i32_with_unfolded_gep_offset(<4 x i32> %v, <4 x i32>* %p) {
   ret void
 }
 
+define void @store_narrowing_v4i32_with_unfolded_gep_offset(<4 x i16> %v, <4 x i16>* %p) {
+; CHECK-LABEL: store_narrowing_v4i32_with_unfolded_gep_offset:
+; CHECK:         .functype store_narrowing_v4i32_with_unfolded_gep_offset (v128, i32) -> ()
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    local.get 1
+; CHECK-NEXT:    i32.const 8
+; CHECK-NEXT:    i32.add
+; CHECK-NEXT:    i32.const 65535
+; CHECK-NEXT:    i32x4.splat
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v128.and
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i16x8.narrow_i32x4_u
+; CHECK-NEXT:    i64x2.extract_lane 0
+; CHECK-NEXT:    i64.store 0
+; CHECK-NEXT:    # fallthrough-return
+  %s = getelementptr <4 x i16>, <4 x i16>* %p, i32 1
+  store <4 x i16> %v , <4 x i16>* %s
+  ret void
+}
+
 define void @store_v4i32_to_numeric_address(<4 x i32> %v) {
 ; CHECK-LABEL: store_v4i32_to_numeric_address:
 ; CHECK:         .functype store_v4i32_to_numeric_address (v128) -> ()
@@ -1676,6 +1959,25 @@ define void @store_v4i32_to_numeric_address(<4 x i32> %v) {
   ret void
 }
 
+define void @store_narrowing_v4i32_to_numeric_address(<4 x i16> %v) {
+; CHECK-LABEL: store_narrowing_v4i32_to_numeric_address:
+; CHECK:         .functype store_narrowing_v4i32_to_numeric_address (v128) -> ()
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    i32.const 0
+; CHECK-NEXT:    i32.const 65535
+; CHECK-NEXT:    i32x4.splat
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v128.and
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i16x8.narrow_i32x4_u
+; CHECK-NEXT:    i64x2.extract_lane 0
+; CHECK-NEXT:    i64.store 32
+; CHECK-NEXT:    # fallthrough-return
+  %s = inttoptr i32 32 to <4 x i16>*
+  store <4 x i16> %v , <4 x i16>* %s
+  ret void
+}
+
 define void @store_v4i32_to_global_address(<4 x i32> %v) {
 ; CHECK-LABEL: store_v4i32_to_global_address:
 ; CHECK:         .functype store_v4i32_to_global_address (v128) -> ()
@@ -1688,6 +1990,24 @@ define void @store_v4i32_to_global_address(<4 x i32> %v) {
   ret void
 }
 
+define void @store_narrowing_v4i32_to_global_address(<4 x i16> %v) {
+; CHECK-LABEL: store_narrowing_v4i32_to_global_address:
+; CHECK:         .functype store_narrowing_v4i32_to_global_address (v128) -> ()
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:    i32.const 0
+; CHECK-NEXT:    i32.const 65535
+; CHECK-NEXT:    i32x4.splat
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v128.and
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i16x8.narrow_i32x4_u
+; CHECK-NEXT:    i64x2.extract_lane 0
+; CHECK-NEXT:    i64.store gv_v4i16
+; CHECK-NEXT:    # fallthrough-return
+  store <4 x i16> %v , <4 x i16>* @gv_v4i16
+  ret void
+}
+
 ; ==============================================================================
 ; 2 x i64
 ; ==============================================================================