From 716e8e6858222a1783bd62e10e7f26491cd2cd4f Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Sat, 29 Sep 2018 17:49:42 +0000 Subject: [PATCH] [X86] Add more of the icc unaligned load/store to/from 128 bit vector intrinsics Summary: This patch adds _mm_loadu_si32 _mm_loadu_si16 _mm_storeu_si64 _mm_storeu_si32 _mm_storeu_si16 We already had _mm_load_si64. Reviewers: spatel, RKSimon Reviewed By: RKSimon Subscribers: cfe-commits Differential Revision: https://reviews.llvm.org/D52665 llvm-svn: 343388 --- clang/lib/Headers/emmintrin.h | 107 ++++++++++++++++++++++++++++++++++++- clang/test/CodeGen/sse2-builtins.c | 48 +++++++++++++++++ 2 files changed, 154 insertions(+), 1 deletion(-) diff --git a/clang/lib/Headers/emmintrin.h b/clang/lib/Headers/emmintrin.h index f0ea7cd..6d61f97 100644 --- a/clang/lib/Headers/emmintrin.h +++ b/clang/lib/Headers/emmintrin.h @@ -1675,7 +1675,49 @@ _mm_loadu_si64(void const *__a) long long __v; } __attribute__((__packed__, __may_alias__)); long long __u = ((struct __loadu_si64*)__a)->__v; - return __extension__ (__m128i)(__v2di){__u, 0L}; + return __extension__ (__m128i)(__v2di){__u, 0LL}; +} + +/// Loads a 32-bit integer value to the low element of a 128-bit integer +/// vector and clears the upper element. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VMOVD / MOVD instruction. +/// +/// \param __a +/// A pointer to a 32-bit memory location. The address of the memory +/// location does not have to be aligned. +/// \returns A 128-bit vector of [4 x i32] containing the loaded value. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_loadu_si32(void const *__a) +{ + struct __loadu_si32 { + int __v; + } __attribute__((__packed__, __may_alias__)); + int __u = ((struct __loadu_si32*)__a)->__v; + return __extension__ (__m128i)(__v4si){__u, 0, 0, 0}; +} + +/// Loads a 16-bit integer value to the low element of a 128-bit integer +/// vector and clears the upper element. 
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic does not correspond to a specific instruction.
+///
+/// \param __a
+///    A pointer to a 16-bit memory location. The address of the memory
+///    location does not have to be aligned.
+/// \returns A 128-bit vector of [8 x i16] containing the loaded value.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_loadu_si16(void const *__a)
+{
+  struct __loadu_si16 {
+    short __v;
+  } __attribute__((__packed__, __may_alias__));
+  short __u = ((struct __loadu_si16*)__a)->__v;
+  return __extension__ (__m128i)(__v8hi){__u, 0, 0, 0, 0, 0, 0, 0};
 }
 
 /// Loads a 64-bit double-precision value to the low element of a
@@ -3993,6 +4035,69 @@ _mm_storeu_si128(__m128i *__p, __m128i __b)
   ((struct __storeu_si128*)__p)->__v = __b;
 }
 
+/// Stores a 64-bit integer value from the low element of a 128-bit integer
+/// vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
+///
+/// \param __p
+///    A pointer to a 64-bit memory location. The address of the memory
+///    location does not have to be aligned.
+/// \param __b
+///    A 128-bit integer vector containing the value to be stored.
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_storeu_si64(void *__p, __m128i __b)
+{
+  struct __storeu_si64 {
+    long long __v;
+  } __attribute__((__packed__, __may_alias__));
+  ((struct __storeu_si64*)__p)->__v = ((__v2di)__b)[0];
+}
+
+/// Stores a 32-bit integer value from the low element of a 128-bit integer
+/// vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
+///
+/// \param __p
+///    A pointer to a 32-bit memory location. The address of the memory
+///    location does not have to be aligned.
+/// \param __b
+///    A 128-bit integer vector containing the value to be stored.
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_storeu_si32(void *__p, __m128i __b)
+{
+  struct __storeu_si32 {
+    int __v;
+  } __attribute__((__packed__, __may_alias__));
+  ((struct __storeu_si32*)__p)->__v = ((__v4si)__b)[0];
+}
+
+/// Stores a 16-bit integer value from the low element of a 128-bit integer
+/// vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic does not correspond to a specific instruction.
+///
+/// \param __p
+///    A pointer to a 16-bit memory location. The address of the memory
+///    location does not have to be aligned.
+/// \param __b
+///    A 128-bit integer vector containing the value to be stored.
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_storeu_si16(void *__p, __m128i __b)
+{
+  struct __storeu_si16 {
+    short __v;
+  } __attribute__((__packed__, __may_alias__));
+  ((struct __storeu_si16*)__p)->__v = ((__v8hi)__b)[0];
+}
+
 /// Moves bytes selected by the mask from the first operand to the
 /// specified unaligned memory location. When a mask bit is 1, the
 /// corresponding byte is written, otherwise it is not written.
diff --git a/clang/test/CodeGen/sse2-builtins.c b/clang/test/CodeGen/sse2-builtins.c index ac22f5b..005bdfd 100644 --- a/clang/test/CodeGen/sse2-builtins.c +++ b/clang/test/CodeGen/sse2-builtins.c @@ -721,6 +721,30 @@ __m128i test_mm_loadu_si64(void const* A) { return _mm_loadu_si64(A); } +__m128i test_mm_loadu_si32(void const* A) { + // CHECK-LABEL: test_mm_loadu_si32 + // CHECK: load i32, i32* %{{.*}}, align 1{{$}} + // CHECK: insertelement <4 x i32> undef, i32 %{{.*}}, i32 0 + // CHECK: insertelement <4 x i32> %{{.*}}, i32 0, i32 1 + // CHECK: insertelement <4 x i32> %{{.*}}, i32 0, i32 2 + // CHECK: insertelement <4 x i32> %{{.*}}, i32 0, i32 3 + return _mm_loadu_si32(A); +} + +__m128i test_mm_loadu_si16(void const* A) { + // CHECK-LABEL: test_mm_loadu_si16 + // CHECK: load i16, i16* %{{.*}}, align 1{{$}} + // CHECK: insertelement <8 x i16> undef, i16 %{{.*}}, i32 0 + // CHECK: insertelement <8 x i16> %{{.*}}, i16 0, i32 1 + // CHECK: insertelement <8 x i16> %{{.*}}, i16 0, i32 2 + // CHECK: insertelement <8 x i16> %{{.*}}, i16 0, i32 3 + // CHECK: insertelement <8 x i16> %{{.*}}, i16 0, i32 4 + // CHECK: insertelement <8 x i16> %{{.*}}, i16 0, i32 5 + // CHECK: insertelement <8 x i16> %{{.*}}, i16 0, i32 6 + // CHECK: insertelement <8 x i16> %{{.*}}, i16 0, i32 7 + return _mm_loadu_si16(A); +} + __m128i test_mm_madd_epi16(__m128i A, __m128i B) { // CHECK-LABEL: test_mm_madd_epi16 // CHECK: call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %{{.*}}, <8 x i16> %{{.*}}) @@ -1351,6 +1375,30 @@ void test_mm_storeu_si128(__m128i* A, __m128i B) { _mm_storeu_si128(A, B); } +void test_mm_storeu_si64(void* A, __m128i B) { + // CHECK-LABEL: test_mm_storeu_si64 + // CHECK: [[EXT:%.*]] = extractelement <2 x i64> %{{.*}}, i32 0 + // CHECK: store i64 [[EXT]], i64* %{{.*}}, align 1{{$}} + // CHECK-NEXT: ret void + _mm_storeu_si64(A, B); +} + +void test_mm_storeu_si32(void* A, __m128i B) { + // CHECK-LABEL: test_mm_storeu_si32 + // CHECK: [[EXT:%.*]] = extractelement <4 x i32> 
%{{.*}}, i32 0 + // CHECK: store i32 [[EXT]], i32* %{{.*}}, align 1{{$}} + // CHECK-NEXT: ret void + _mm_storeu_si32(A, B); +} + +void test_mm_storeu_si16(void* A, __m128i B) { + // CHECK-LABEL: test_mm_storeu_si16 + // CHECK: [[EXT:%.*]] = extractelement <8 x i16> %{{.*}}, i32 0 + // CHECK: store i16 [[EXT]], i16* %{{.*}}, align 1{{$}} + // CHECK-NEXT: ret void + _mm_storeu_si16(A, B); +} + void test_mm_stream_pd(double *A, __m128d B) { // CHECK-LABEL: test_mm_stream_pd // CHECK: store <2 x double> %{{.*}}, <2 x double>* %{{.*}}, align 16, !nontemporal -- 2.7.4