From 6ad92200672de5a47588a828f6b21a071c0d1dcb Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Fri, 28 Sep 2018 17:09:51 +0000 Subject: [PATCH] [X86] Add the movbe instruction intrinsics from icc. These intrinsics exist in icc. They can be found on the Intel Intrinsics Guide website. All the backend support is in place to pattern match a load+bswap or a bswap+store pattern to the MOVBE instructions. So we just need to get the frontend to emit the correct IR. The pointer arguments in icc are declared as void so I had to jump through a packed struct to forcing a specific alignment on the load/store. Same trick we use in the unaligned vector load/store intrinsics Differential Revision: https://reviews.llvm.org/D52586 llvm-svn: 343343 --- clang/lib/Basic/Targets/X86.cpp | 3 ++ clang/lib/Headers/immintrin.h | 59 ++++++++++++++++++++++++ clang/test/CodeGen/movbe-builtins.c | 49 ++++++++++++++++++++ clang/test/Preprocessor/predefined-arch-macros.c | 32 +++++++++++++ 4 files changed, 143 insertions(+) create mode 100644 clang/test/CodeGen/movbe-builtins.c diff --git a/clang/lib/Basic/Targets/X86.cpp b/clang/lib/Basic/Targets/X86.cpp index 22a8b1e..94d1112 100644 --- a/clang/lib/Basic/Targets/X86.cpp +++ b/clang/lib/Basic/Targets/X86.cpp @@ -1081,6 +1081,9 @@ void X86TargetInfo::getTargetDefines(const LangOptions &Opts, if (HasMWAITX) Builder.defineMacro("__MWAITX__"); + if (HasMOVBE) + Builder.defineMacro("__MOVBE__"); + switch (XOPLevel) { case XOP: Builder.defineMacro("__XOP__"); diff --git a/clang/lib/Headers/immintrin.h b/clang/lib/Headers/immintrin.h index e7bfbf9..7d0722e 100644 --- a/clang/lib/Headers/immintrin.h +++ b/clang/lib/Headers/immintrin.h @@ -306,6 +306,65 @@ _writegsbase_u64(unsigned long long __V) #endif #endif /* __FSGSBASE__ */ +#if !defined(_MSC_VER) || __has_feature(modules) || defined(__MOVBE__) + +/* The structs used below are to force the load/store to be unaligned. This + * is accomplished with the __packed__ attribute. The __may_alias__ prevents + * tbaa metadata from being generated based on the struct and the type of the + * field inside of it. + */ + +static __inline__ short __attribute__((__always_inline__, __nodebug__, __target__("movbe"))) +_loadbe_i16(void const * __P) { + struct __loadu_i16 { + short __v; + } __attribute__((__packed__, __may_alias__)); + return __builtin_bswap16(((struct __loadu_i16*)__P)->__v); +} + +static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("movbe"))) +_storebe_i16(void * __P, short __D) { + struct __storeu_i16 { + short __v; + } __attribute__((__packed__, __may_alias__)); + ((struct __storeu_i16*)__P)->__v = __builtin_bswap16(__D); +} + +static __inline__ int __attribute__((__always_inline__, __nodebug__, __target__("movbe"))) +_loadbe_i32(void const * __P) { + struct __loadu_i32 { + int __v; + } __attribute__((__packed__, __may_alias__)); + return __builtin_bswap32(((struct __loadu_i32*)__P)->__v); +} + +static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("movbe"))) +_storebe_i32(void * __P, int __D) { + struct __storeu_i32 { + int __v; + } __attribute__((__packed__, __may_alias__)); + ((struct __storeu_i32*)__P)->__v = __builtin_bswap32(__D); +} + +#ifdef __x86_64__ +static __inline__ long long __attribute__((__always_inline__, __nodebug__, __target__("movbe"))) +_loadbe_i64(void const * __P) { + struct __loadu_i64 { + long long __v; + } __attribute__((__packed__, __may_alias__)); + return __builtin_bswap64(((struct __loadu_i64*)__P)->__v); +} + +static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("movbe"))) +_storebe_i64(void * __P, long long __D) { + struct __storeu_i64 { + long long __v; + } __attribute__((__packed__, __may_alias__)); + ((struct __storeu_i64*)__P)->__v = __builtin_bswap64(__D); +} +#endif +#endif /* __MOVBE */ + #if !defined(_MSC_VER) || __has_feature(modules) || defined(__RTM__) #include #include diff --git a/clang/test/CodeGen/movbe-builtins.c b/clang/test/CodeGen/movbe-builtins.c new file mode 100644 index 0000000..342f663 --- /dev/null +++ b/clang/test/CodeGen/movbe-builtins.c @@ -0,0 +1,49 @@ +// RUN: %clang_cc1 -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +movbe -emit-llvm -o - | FileCheck %s --check-prefixes=CHECK,CHECK-X64 +// RUN: %clang_cc1 -ffreestanding %s -triple=i686-apple-darwin -target-feature +movbe -emit-llvm -o - | FileCheck %s + + +#include + +short test_loadbe_i16(const short *P) { + // CHECK-LABEL: @test_loadbe_i16 + // CHECK: [[LOAD:%.*]] = load i16, i16* %{{.*}}, align 1 + // CHECK: call i16 @llvm.bswap.i16(i16 [[LOAD]]) + return _loadbe_i16(P); +} + +void test_storebe_i16(short *P, short D) { + // CHECK-LABEL: @test_storebe_i16 + // CHECK: [[DATA:%.*]] = call i16 @llvm.bswap.i16(i16 %{{.*}}) + // CHECK: store i16 [[DATA]], i16* %{{.*}}, align 1 + _storebe_i16(P, D); +} + +int test_loadbe_i32(const int *P) { + // CHECK-LABEL: @test_loadbe_i32 + // CHECK: [[LOAD:%.*]] = load i32, i32* %{{.*}}, align 1 + // CHECK: call i32 @llvm.bswap.i32(i32 [[LOAD]]) + return _loadbe_i32(P); +} + +void test_storebe_i32(int *P, int D) { + // CHECK-LABEL: @test_storebe_i32 + // CHECK: [[DATA:%.*]] = call i32 @llvm.bswap.i32(i32 %{{.*}}) + // CHECK: store i32 [[DATA]], i32* %{{.*}}, align 1 + _storebe_i32(P, D); +} + +#ifdef __x86_64__ +long long test_loadbe_i64(const long long *P) { + // CHECK-X64-LABEL: @test_loadbe_i64 + // CHECK-X64: [[LOAD:%.*]] = load i64, i64* %{{.*}}, align 1 + // CHECK-X64: call i64 @llvm.bswap.i64(i64 [[LOAD]]) + return _loadbe_i64(P); +} + +void test_storebe_i64(long long *P, long long D) { + // CHECK-X64-LABEL: @test_storebe_i64 + // CHECK-X64: [[DATA:%.*]] = call i64 @llvm.bswap.i64(i64 %{{.*}}) + // CHECK-X64: store i64 [[DATA]], i64* %{{.*}}, align 1 + _storebe_i64(P, D); +} +#endif diff --git a/clang/test/Preprocessor/predefined-arch-macros.c b/clang/test/Preprocessor/predefined-arch-macros.c index 33e6b81..df61a65 100644 --- a/clang/test/Preprocessor/predefined-arch-macros.c +++ b/clang/test/Preprocessor/predefined-arch-macros.c @@ -524,6 +524,7 @@ // CHECK_CORE_AVX2_M32: #define __INVPCID__ 1 // CHECK_CORE_AVX2_M32: #define __LZCNT__ 1 // CHECK_CORE_AVX2_M32: #define __MMX__ 1 +// CHECK_CORE_AVX2_M32: #define __MOVBE__ 1 // CHECK_CORE_AVX2_M32: #define __PCLMUL__ 1 // CHECK_CORE_AVX2_M32: #define __POPCNT__ 1 // CHECK_CORE_AVX2_M32: #define __RDRND__ 1 @@ -554,6 +555,7 @@ // CHECK_CORE_AVX2_M64: #define __INVPCID__ 1 // CHECK_CORE_AVX2_M64: #define __LZCNT__ 1 // CHECK_CORE_AVX2_M64: #define __MMX__ 1 +// CHECK_CORE_AVX2_M64: #define __MOVBE__ 1 // CHECK_CORE_AVX2_M64: #define __PCLMUL__ 1 // CHECK_CORE_AVX2_M64: #define __POPCNT__ 1 // CHECK_CORE_AVX2_M64: #define __RDRND__ 1 @@ -588,6 +590,7 @@ // CHECK_BROADWELL_M32: #define __INVPCID__ 1 // CHECK_BROADWELL_M32: #define __LZCNT__ 1 // CHECK_BROADWELL_M32: #define __MMX__ 1 +// CHECK_BROADWELL_M32: #define __MOVBE__ 1 // CHECK_BROADWELL_M32: #define __PCLMUL__ 1 // CHECK_BROADWELL_M32: #define __POPCNT__ 1 // CHECK_BROADWELL_M32: #define __PRFCHW__ 1 @@ -621,6 +624,7 @@ // CHECK_BROADWELL_M64: #define __INVPCID__ 1 // CHECK_BROADWELL_M64: #define __LZCNT__ 1 // CHECK_BROADWELL_M64: #define __MMX__ 1 +// CHECK_BROADWELL_M64: #define __MOVBE__ 1 // CHECK_BROADWELL_M64: #define __PCLMUL__ 1 // CHECK_BROADWELL_M64: #define __POPCNT__ 1 // CHECK_BROADWELL_M64: #define __PRFCHW__ 1 @@ -659,6 +663,7 @@ // CHECK_SKL_M32: #define __INVPCID__ 1 // CHECK_SKL_M32: #define __LZCNT__ 1 // CHECK_SKL_M32: #define __MMX__ 1 +// CHECK_SKL_M32: #define __MOVBE__ 1 // CHECK_SKL_M32: #define __MPX__ 1 // CHECK_SKL_M32: #define __PCLMUL__ 1 // CHECK_SKL_M32: #define __POPCNT__ 1 @@ -694,6 +699,7 @@ // CHECK_SKL_M64: #define __INVPCID__ 1 // CHECK_SKL_M64: #define __LZCNT__ 1 // CHECK_SKL_M64: #define __MMX__ 1 +// CHECK_SKL_M64: #define __MOVBE__ 1 // CHECK_SKL_M64: #define __MPX__ 1 // CHECK_SKL_M64: #define __PCLMUL__ 1 // CHECK_SKL_M64: #define __POPCNT__ 1 @@ -735,6 +741,7 @@ // CHECK_KNL_M32: #define __FMA__ 1 // CHECK_KNL_M32: #define __LZCNT__ 1 // CHECK_KNL_M32: #define __MMX__ 1 +// CHECK_KNL_M32: #define __MOVBE__ 1 // CHECK_KNL_M32: #define __PCLMUL__ 1 // CHECK_KNL_M32: #define __POPCNT__ 1 // CHECK_KNL_M32: #define __PREFETCHWT1__ 1 @@ -772,6 +779,7 @@ // CHECK_KNL_M64: #define __FMA__ 1 // CHECK_KNL_M64: #define __LZCNT__ 1 // CHECK_KNL_M64: #define __MMX__ 1 +// CHECK_KNL_M64: #define __MOVBE__ 1 // CHECK_KNL_M64: #define __PCLMUL__ 1 // CHECK_KNL_M64: #define __POPCNT__ 1 // CHECK_KNL_M64: #define __PREFETCHWT1__ 1 @@ -813,6 +821,7 @@ // CHECK_KNM_M32: #define __FMA__ 1 // CHECK_KNM_M32: #define __LZCNT__ 1 // CHECK_KNM_M32: #define __MMX__ 1 +// CHECK_KNM_M32: #define __MOVBE__ 1 // CHECK_KNM_M32: #define __PCLMUL__ 1 // CHECK_KNM_M32: #define __POPCNT__ 1 // CHECK_KNM_M32: #define __PREFETCHWT1__ 1 @@ -848,6 +857,7 @@ // CHECK_KNM_M64: #define __FMA__ 1 // CHECK_KNM_M64: #define __LZCNT__ 1 // CHECK_KNM_M64: #define __MMX__ 1 +// CHECK_KNM_M64: #define __MOVBE__ 1 // CHECK_KNM_M64: #define __PCLMUL__ 1 // CHECK_KNM_M64: #define __POPCNT__ 1 // CHECK_KNM_M64: #define __PREFETCHWT1__ 1 @@ -889,6 +899,7 @@ // CHECK_SKX_M32: #define __INVPCID__ 1 // CHECK_SKX_M32: #define __LZCNT__ 1 // CHECK_SKX_M32: #define __MMX__ 1 +// CHECK_SKX_M32: #define __MOVBE__ 1 // CHECK_SKX_M32: #define __MPX__ 1 // CHECK_SKX_M32: #define __PCLMUL__ 1 // CHECK_SKX_M32: #define __PKU__ 1 @@ -935,6 +946,7 @@ // CHECK_SKX_M64: #define __INVPCID__ 1 // CHECK_SKX_M64: #define __LZCNT__ 1 // CHECK_SKX_M64: #define __MMX__ 1 +// CHECK_SKX_M64: #define __MOVBE__ 1 // CHECK_SKX_M64: #define __MPX__ 1 // CHECK_SKX_M64: #define __PCLMUL__ 1 // CHECK_SKX_M64: #define __PKU__ 1 @@ -986,6 +998,7 @@ // CHECK_CNL_M32: #define __INVPCID__ 1 // CHECK_CNL_M32: #define __LZCNT__ 1 // CHECK_CNL_M32: #define __MMX__ 1 +// CHECK_CNL_M32: #define __MOVBE__ 1 // CHECK_CNL_M32: #define __MPX__ 1 // CHECK_CNL_M32: #define __PCLMUL__ 1 // CHECK_CNL_M32: #define __PKU__ 1 @@ -1035,6 +1048,7 @@ // CHECK_CNL_M64: #define __INVPCID__ 1 // CHECK_CNL_M64: #define __LZCNT__ 1 // CHECK_CNL_M64: #define __MMX__ 1 +// CHECK_CNL_M64: #define __MOVBE__ 1 // CHECK_CNL_M64: #define __MPX__ 1 // CHECK_CNL_M64: #define __PCLMUL__ 1 // CHECK_CNL_M64: #define __PKU__ 1 @@ -1090,6 +1104,7 @@ // CHECK_ICL_M32: #define __INVPCID__ 1 // CHECK_ICL_M32: #define __LZCNT__ 1 // CHECK_ICL_M32: #define __MMX__ 1 +// CHECK_ICL_M32: #define __MOVBE__ 1 // CHECK_ICL_M32: #define __MPX__ 1 // CHECK_ICL_M32: #define __PCLMUL__ 1 // CHECK_ICL_M32: #define __PKU__ 1 @@ -1148,6 +1163,7 @@ // CHECK_ICL_M64: #define __INVPCID__ 1 // CHECK_ICL_M64: #define __LZCNT__ 1 // CHECK_ICL_M64: #define __MMX__ 1 +// CHECK_ICL_M64: #define __MOVBE__ 1 // CHECK_ICL_M64: #define __MPX__ 1 // CHECK_ICL_M64: #define __PCLMUL__ 1 // CHECK_ICL_M64: #define __PKU__ 1 @@ -1207,6 +1223,7 @@ // CHECK_ICX_M32: #define __INVPCID__ 1 // CHECK_ICX_M32: #define __LZCNT__ 1 // CHECK_ICX_M32: #define __MMX__ 1 +// CHECK_ICX_M32: #define __MOVBE__ 1 // CHECK_ICX_M32: #define __MPX__ 1 // CHECK_ICX_M32: #define __PCLMUL__ 1 // CHECK_ICX_M32: #define __PCONFIG__ 1 @@ -1266,6 +1283,7 @@ // CHECK_ICX_M64: #define __INVPCID__ 1 // CHECK_ICX_M64: #define __LZCNT__ 1 // CHECK_ICX_M64: #define __MMX__ 1 +// CHECK_ICX_M64: #define __MOVBE__ 1 // CHECK_ICX_M64: #define __MPX__ 1 // CHECK_ICX_M64: #define __PCLMUL__ 1 // CHECK_ICX_M64: #define __PCONFIG__ 1 @@ -1303,6 +1321,7 @@ // RUN: -target i386-unknown-linux \ // RUN: | FileCheck -match-full-lines %s -check-prefix=CHECK_ATOM_M32 // CHECK_ATOM_M32: #define __MMX__ 1 +// CHECK_ATOM_M32: #define __MOVBE__ 1 // CHECK_ATOM_M32: #define __SSE2__ 1 // CHECK_ATOM_M32: #define __SSE3__ 1 // CHECK_ATOM_M32: #define __SSE__ 1 @@ -1318,6 +1337,7 @@ // RUN: -target i386-unknown-linux \ // RUN: | FileCheck -match-full-lines %s -check-prefix=CHECK_ATOM_M64 // CHECK_ATOM_M64: #define __MMX__ 1 +// CHECK_ATOM_M64: #define __MOVBE__ 1 // CHECK_ATOM_M64: #define __SSE2_MATH__ 1 // CHECK_ATOM_M64: #define __SSE2__ 1 // CHECK_ATOM_M64: #define __SSE3__ 1 @@ -1340,6 +1360,7 @@ // CHECK_GLM_M32: #define __FSGSBASE__ 1 // CHECK_GLM_M32: #define __FXSR__ 1 // CHECK_GLM_M32: #define __MMX__ 1 +// CHECK_GLM_M32: #define __MOVBE__ 1 // CHECK_GLM_M32: #define __MPX__ 1 // CHECK_GLM_M32: #define __PCLMUL__ 1 // CHECK_GLM_M32: #define __POPCNT__ 1 @@ -1373,6 +1394,7 @@ // CHECK_GLM_M64: #define __FSGSBASE__ 1 // CHECK_GLM_M64: #define __FXSR__ 1 // CHECK_GLM_M64: #define __MMX__ 1 +// CHECK_GLM_M64: #define __MOVBE__ 1 // CHECK_GLM_M64: #define __MPX__ 1 // CHECK_GLM_M64: #define __PCLMUL__ 1 // CHECK_GLM_M64: #define __POPCNT__ 1 @@ -1404,6 +1426,7 @@ // CHECK_GLMP_M32: #define __FSGSBASE__ 1 // CHECK_GLMP_M32: #define __FXSR__ 1 // CHECK_GLMP_M32: #define __MMX__ 1 +// CHECK_GLMP_M32: #define __MOVBE__ 1 // CHECK_GLMP_M32: #define __MPX__ 1 // CHECK_GLMP_M32: #define __PCLMUL__ 1 // CHECK_GLMP_M32: #define __POPCNT__ 1 @@ -1440,6 +1463,7 @@ // CHECK_GLMP_M64: #define __FSGSBASE__ 1 // CHECK_GLMP_M64: #define __FXSR__ 1 // CHECK_GLMP_M64: #define __MMX__ 1 +// CHECK_GLMP_M64: #define __MOVBE__ 1 // CHECK_GLMP_M64: #define __MPX__ 1 // CHECK_GLMP_M64: #define __PCLMUL__ 1 // CHECK_GLMP_M64: #define __POPCNT__ 1 @@ -1476,6 +1500,7 @@ // CHECK_TRM_M32: #define __FXSR__ 1 // CHECK_TRM_M32: #define __GFNI__ 1 // CHECK_TRM_M32: #define __MMX__ 1 +// CHECK_TRM_M32: #define __MOVBE__ 1 // CHECK_TRM_M32: #define __MOVDIR64B__ 1 // CHECK_TRM_M32: #define __MOVDIRI__ 1 // CHECK_TRM_M32: #define __MPX__ 1 @@ -1517,6 +1542,7 @@ // CHECK_TRM_M64: #define __FXSR__ 1 // CHECK_TRM_M64: #define __GFNI__ 1 // CHECK_TRM_M64: #define __MMX__ 1 +// CHECK_TRM_M64: #define __MOVBE__ 1 // CHECK_TRM_M64: #define __MOVDIR64B__ 1 // CHECK_TRM_M64: #define __MOVDIRI__ 1 // CHECK_TRM_M64: #define __MPX__ 1 @@ -1551,6 +1577,7 @@ // RUN: | FileCheck -match-full-lines %s -check-prefix=CHECK_SLM_M32 // CHECK_SLM_M32: #define __FXSR__ 1 // CHECK_SLM_M32: #define __MMX__ 1 +// CHECK_SLM_M32: #define __MOVBE__ 1 // CHECK_SLM_M32: #define __PCLMUL__ 1 // CHECK_SLM_M32: #define __POPCNT__ 1 // CHECK_SLM_M32: #define __PRFCHW__ 1 @@ -1573,6 +1600,7 @@ // RUN: | FileCheck -match-full-lines %s -check-prefix=CHECK_SLM_M64 // CHECK_SLM_M64: #define __FXSR__ 1 // CHECK_SLM_M64: #define __MMX__ 1 +// CHECK_SLM_M64: #define __MOVBE__ 1 // CHECK_SLM_M64: #define __PCLMUL__ 1 // CHECK_SLM_M64: #define __POPCNT__ 1 // CHECK_SLM_M64: #define __PRFCHW__ 1 @@ -2134,6 +2162,7 @@ // CHECK_BTVER2_M32: #define __F16C__ 1 // CHECK_BTVER2_M32: #define __LZCNT__ 1 // CHECK_BTVER2_M32: #define __MMX__ 1 +// CHECK_BTVER2_M32: #define __MOVBE__ 1 // CHECK_BTVER2_M32: #define __PCLMUL__ 1 // CHECK_BTVER2_M32: #define __POPCNT__ 1 // CHECK_BTVER2_M32: #define __PRFCHW__ 1 @@ -2163,6 +2192,7 @@ // CHECK_BTVER2_M64: #define __F16C__ 1 // CHECK_BTVER2_M64: #define __LZCNT__ 1 // CHECK_BTVER2_M64: #define __MMX__ 1 +// CHECK_BTVER2_M64: #define __MOVBE__ 1 // CHECK_BTVER2_M64: #define __PCLMUL__ 1 // CHECK_BTVER2_M64: #define __POPCNT__ 1 // CHECK_BTVER2_M64: #define __PRFCHW__ 1 @@ -2491,6 +2521,7 @@ // CHECK_ZNVER1_M32: #define __FSGSBASE__ 1 // CHECK_ZNVER1_M32: #define __LZCNT__ 1 // CHECK_ZNVER1_M32: #define __MMX__ 1 +// CHECK_ZNVER1_M32: #define __MOVBE__ 1 // CHECK_ZNVER1_M32: #define __PCLMUL__ 1 // CHECK_ZNVER1_M32: #define __POPCNT__ 1 // CHECK_ZNVER1_M32: #define __PRFCHW__ 1 @@ -2534,6 +2565,7 @@ // CHECK_ZNVER1_M64: #define __FSGSBASE__ 1 // CHECK_ZNVER1_M64: #define __LZCNT__ 1 // CHECK_ZNVER1_M64: #define __MMX__ 1 +// CHECK_ZNVER1_M64: #define __MOVBE__ 1 // CHECK_ZNVER1_M64: #define __PCLMUL__ 1 // CHECK_ZNVER1_M64: #define __POPCNT__ 1 // CHECK_ZNVER1_M64: #define __PRFCHW__ 1 -- 2.7.4