From b9ea36f9c1c716db1c98960b352961717bce1a19 Mon Sep 17 00:00:00 2001
From: Sam Parker
Date: Thu, 4 May 2017 08:37:59 +0000
Subject: [PATCH] [ARM] ACLE Chapter 9 intrinsics

Implemented the remaining integer data processing intrinsics from the
ARM ACLE v2.1 spec, such as parallel arithmetic and DSP-style
multiplications.

Differential Revision: https://reviews.llvm.org/D32282

llvm-svn: 302131
---
 clang/include/clang/Basic/BuiltinsARM.def |  84 ++++++-
 clang/lib/Headers/arm_acle.h              | 318 +++++++++++++++++++++++-
 clang/test/CodeGen/arm_acle.c             | 393 +++++++++++++++++++++++++++++-
 3 files changed, 788 insertions(+), 7 deletions(-)

diff --git a/clang/include/clang/Basic/BuiltinsARM.def b/clang/include/clang/Basic/BuiltinsARM.def
index 6cc7308..e8db347 100644
--- a/clang/include/clang/Basic/BuiltinsARM.def
+++ b/clang/include/clang/Basic/BuiltinsARM.def
@@ -25,11 +25,93 @@
 // In libgcc
 BUILTIN(__clear_cache, "vv*v*", "i")
 
+// 16-bit multiplications
+BUILTIN(__builtin_arm_smulbb, "iii", "nc")
+BUILTIN(__builtin_arm_smulbt, "iii", "nc")
+BUILTIN(__builtin_arm_smultb, "iii", "nc")
+BUILTIN(__builtin_arm_smultt, "iii", "nc")
+BUILTIN(__builtin_arm_smulwb, "iii", "nc")
+BUILTIN(__builtin_arm_smulwt, "iii", "nc")
+
 // Saturating arithmetic
 BUILTIN(__builtin_arm_qadd, "iii", "nc")
 BUILTIN(__builtin_arm_qsub, "iii", "nc")
 BUILTIN(__builtin_arm_ssat, "iiUi", "nc")
-BUILTIN(__builtin_arm_usat, "UiUiUi", "nc")
+BUILTIN(__builtin_arm_usat, "UiiUi", "nc")
+
+BUILTIN(__builtin_arm_smlabb, "iiii", "nc")
+BUILTIN(__builtin_arm_smlabt, "iiii", "nc")
+BUILTIN(__builtin_arm_smlatb, "iiii", "nc")
+BUILTIN(__builtin_arm_smlatt, "iiii", "nc")
+BUILTIN(__builtin_arm_smlawb, "iiii", "nc")
+BUILTIN(__builtin_arm_smlawt, "iiii", "nc")
+
+BUILTIN(__builtin_arm_ssat16, "iii", "nc")
+BUILTIN(__builtin_arm_usat16, "iii", "nc")
+
+BUILTIN(__builtin_arm_sxtab16, "iii", "nc")
+BUILTIN(__builtin_arm_sxtb16, "ii", "nc")
+BUILTIN(__builtin_arm_uxtab16, "iii", "nc")
+BUILTIN(__builtin_arm_uxtb16, "ii", "nc")
+
+BUILTIN(__builtin_arm_sel, "iii", "nc")
+
+BUILTIN(__builtin_arm_qadd8, "iii", "nc")
+BUILTIN(__builtin_arm_qsub8, "iii", "nc")
+BUILTIN(__builtin_arm_sadd8, "iii", "nc")
+BUILTIN(__builtin_arm_shadd8, "iii", "nc")
+BUILTIN(__builtin_arm_shsub8, "iii", "nc")
+BUILTIN(__builtin_arm_ssub8, "iii", "nc")
+BUILTIN(__builtin_arm_uadd8, "UiUiUi", "nc")
+BUILTIN(__builtin_arm_uhadd8, "UiUiUi", "nc")
+BUILTIN(__builtin_arm_uhsub8, "UiUiUi", "nc")
+BUILTIN(__builtin_arm_uqadd8, "UiUiUi", "nc")
+BUILTIN(__builtin_arm_uqsub8, "UiUiUi", "nc")
+BUILTIN(__builtin_arm_usub8, "UiUiUi", "nc")
+
+// Sum of 8-bit absolute differences
+BUILTIN(__builtin_arm_usad8, "UiUiUi", "nc")
+BUILTIN(__builtin_arm_usada8, "UiUiUiUi", "nc")
+
+// Parallel 16-bit addition and subtraction
+BUILTIN(__builtin_arm_qadd16, "iii", "nc")
+BUILTIN(__builtin_arm_qasx, "iii", "nc")
+BUILTIN(__builtin_arm_qsax, "iii", "nc")
+BUILTIN(__builtin_arm_qsub16, "iii", "nc")
+BUILTIN(__builtin_arm_sadd16, "iii", "nc")
+BUILTIN(__builtin_arm_sasx, "iii", "nc")
+BUILTIN(__builtin_arm_shadd16, "iii", "nc")
+BUILTIN(__builtin_arm_shasx, "iii", "nc")
+BUILTIN(__builtin_arm_shsax, "iii", "nc")
+BUILTIN(__builtin_arm_shsub16, "iii", "nc")
+BUILTIN(__builtin_arm_ssax, "iii", "nc")
+BUILTIN(__builtin_arm_ssub16, "iii", "nc")
+BUILTIN(__builtin_arm_uadd16, "UiUiUi", "nc")
+BUILTIN(__builtin_arm_uasx, "UiUiUi", "nc")
+BUILTIN(__builtin_arm_uhadd16, "UiUiUi", "nc")
+BUILTIN(__builtin_arm_uhasx, "UiUiUi", "nc")
+BUILTIN(__builtin_arm_uhsax, "UiUiUi", "nc")
+BUILTIN(__builtin_arm_uhsub16, "UiUiUi", "nc")
+BUILTIN(__builtin_arm_uqadd16, "UiUiUi", "nc")
+BUILTIN(__builtin_arm_uqasx, "UiUiUi", "nc")
+BUILTIN(__builtin_arm_uqsax, "UiUiUi", "nc")
+BUILTIN(__builtin_arm_uqsub16, "UiUiUi", "nc")
+BUILTIN(__builtin_arm_usax, "UiUiUi", "nc")
+BUILTIN(__builtin_arm_usub16, "UiUiUi", "nc")
+
+// Parallel 16-bit multiplication
+BUILTIN(__builtin_arm_smlad, "iiii", "nc")
+BUILTIN(__builtin_arm_smladx, "iiii", "nc")
+BUILTIN(__builtin_arm_smlald, "LLiiiLLi", "nc")
+BUILTIN(__builtin_arm_smlaldx, "LLiiiLLi", "nc")
+BUILTIN(__builtin_arm_smlsd, "iiii", "nc")
+BUILTIN(__builtin_arm_smlsdx, "iiii", "nc")
+BUILTIN(__builtin_arm_smlsld, "LLiiiLLi", "nc")
+BUILTIN(__builtin_arm_smlsldx, "LLiiiLLi", "nc")
+BUILTIN(__builtin_arm_smuad, "iii", "nc")
+BUILTIN(__builtin_arm_smuadx, "iii", "nc")
+BUILTIN(__builtin_arm_smusd, "iii", "nc")
+BUILTIN(__builtin_arm_smusdx, "iii", "nc")
 
 // Bit manipulation
 BUILTIN(__builtin_arm_rbit, "UiUi", "nc")
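A note on the signature strings above, for readers new to Clang's Builtins.def encoding: "i" is int, "Ui" is unsigned int, and "LLi" is long long, with the leading group naming the return type, so "LLiiiLLi" on __builtin_arm_smlald reads as "returns long long, takes (int, int, long long)"; the attributes "n" and "c" mark the builtin nothrow and const. A minimal sketch of calling one of these builtins directly follows; in practice user code should include <arm_acle.h> and use the wrappers added below. The function name is mine, and the sketch assumes a Clang targeting 32-bit ARM with the DSP extension:

    /* Sketch only: direct builtin call, normally reached via <arm_acle.h>. */
    int saturating_add(int a, int b) {
      /* QADD: signed saturating addition; sets the Q flag on saturation. */
      return __builtin_arm_qadd(a, b);
    }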
"UiUiUi", "nc") +BUILTIN(__builtin_arm_uqadd16, "UiUiUi", "nc") +BUILTIN(__builtin_arm_uqasx, "UiUiUi", "nc") +BUILTIN(__builtin_arm_uqsax, "UiUiUi", "nc") +BUILTIN(__builtin_arm_uqsub16, "UiUiUi", "nc") +BUILTIN(__builtin_arm_usax, "UiUiUi", "nc") +BUILTIN(__builtin_arm_usub16, "UiUiUi", "nc") + +// Parallel 16-bit multiplication +BUILTIN(__builtin_arm_smlad, "iiii", "nc") +BUILTIN(__builtin_arm_smladx, "iiii", "nc") +BUILTIN(__builtin_arm_smlald, "LLiiiLLi", "nc") +BUILTIN(__builtin_arm_smlaldx, "LLiiiLLi", "nc") +BUILTIN(__builtin_arm_smlsd, "iiii", "nc") +BUILTIN(__builtin_arm_smlsdx, "iiii", "nc") +BUILTIN(__builtin_arm_smlsld, "LLiiiLLi", "nc") +BUILTIN(__builtin_arm_smlsldx, "LLiiiLLi", "nc") +BUILTIN(__builtin_arm_smuad, "iii", "nc") +BUILTIN(__builtin_arm_smuadx, "iii", "nc") +BUILTIN(__builtin_arm_smusd, "iii", "nc") +BUILTIN(__builtin_arm_smusdx, "iii", "nc") // Bit manipulation BUILTIN(__builtin_arm_rbit, "UiUi", "nc") diff --git a/clang/lib/Headers/arm_acle.h b/clang/lib/Headers/arm_acle.h index 8423e62..ab25897 100644 --- a/clang/lib/Headers/arm_acle.h +++ b/clang/lib/Headers/arm_acle.h @@ -225,19 +225,49 @@ __rbitl(unsigned long __t) { } /* + * 9.3 16-bit multiplications + */ +#if __ARM_FEATURE_DSP +static __inline__ int32_t __attribute__((__always_inline__,__nodebug__)) +__smulbb(int32_t __a, int32_t __b) { + return __builtin_arm_smulbb(__a, __b); +} +static __inline__ int32_t __attribute__((__always_inline__,__nodebug__)) +__smulbt(int32_t __a, int32_t __b) { + return __builtin_arm_smulbt(__a, __b); +} +static __inline__ int32_t __attribute__((__always_inline__,__nodebug__)) +__smultb(int32_t __a, int32_t __b) { + return __builtin_arm_smultb(__a, __b); +} +static __inline__ int32_t __attribute__((__always_inline__,__nodebug__)) +__smultt(int32_t __a, int32_t __b) { + return __builtin_arm_smultt(__a, __b); +} +static __inline__ int32_t __attribute__((__always_inline__,__nodebug__)) +__smulwb(int32_t __a, int32_t __b) { + return __builtin_arm_smulwb(__a, __b); +} +static __inline__ int32_t __attribute__((__always_inline__,__nodebug__)) +__smulwt(int32_t __a, int32_t __b) { + return __builtin_arm_smulwt(__a, __b); +} +#endif + +/* * 9.4 Saturating intrinsics * * FIXME: Change guard to their corrosponding __ARM_FEATURE flag when Q flag * intrinsics are implemented and the flag is enabled. 
+/*
 * 9.4 Saturating intrinsics
 *
 * FIXME: Change guard to their corresponding __ARM_FEATURE flag when Q flag
 * intrinsics are implemented and the flag is enabled.
 */
 /* 9.4.1 Width-specified saturation intrinsics */
-#if __ARM_32BIT_STATE
+#if __ARM_FEATURE_SAT
 #define __ssat(x, y) __builtin_arm_ssat(x, y)
 #define __usat(x, y) __builtin_arm_usat(x, y)
 #endif
 
 /* 9.4.2 Saturating addition and subtraction intrinsics */
-#if __ARM_32BIT_STATE
+#if __ARM_FEATURE_DSP
 static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
 __qadd(int32_t __t, int32_t __v) {
   return __builtin_arm_qadd(__t, __v);
@@ -254,6 +284,290 @@ __qdbl(int32_t __t) {
 }
 #endif
 
+/* 9.4.3 Accumulating multiplications */
+#if __ARM_FEATURE_DSP
+static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
+__smlabb(int32_t __a, int32_t __b, int32_t __c) {
+  return __builtin_arm_smlabb(__a, __b, __c);
+}
+static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
+__smlabt(int32_t __a, int32_t __b, int32_t __c) {
+  return __builtin_arm_smlabt(__a, __b, __c);
+}
+static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
+__smlatb(int32_t __a, int32_t __b, int32_t __c) {
+  return __builtin_arm_smlatb(__a, __b, __c);
+}
+static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
+__smlatt(int32_t __a, int32_t __b, int32_t __c) {
+  return __builtin_arm_smlatt(__a, __b, __c);
+}
+static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
+__smlawb(int32_t __a, int32_t __b, int32_t __c) {
+  return __builtin_arm_smlawb(__a, __b, __c);
+}
+static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
+__smlawt(int32_t __a, int32_t __b, int32_t __c) {
+  return __builtin_arm_smlawt(__a, __b, __c);
+}
+#endif
+
+
+/* 9.5.4 Parallel 16-bit saturation */
+#if __ARM_FEATURE_SIMD32
+#define __ssat16(x, y) __builtin_arm_ssat16(x, y)
+#define __usat16(x, y) __builtin_arm_usat16(x, y)
+#endif
+
+/* 9.5.5 Packing and unpacking */
+#if __ARM_FEATURE_SIMD32
+typedef int32_t int8x4_t;
+typedef int32_t int16x2_t;
+typedef uint32_t uint8x4_t;
+typedef uint32_t uint16x2_t;
+
+static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
+__sxtab16(int16x2_t __a, int8x4_t __b) {
+  return __builtin_arm_sxtab16(__a, __b);
+}
+static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
+__sxtb16(int8x4_t __a) {
+  return __builtin_arm_sxtb16(__a);
+}
+static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
+__uxtab16(int16x2_t __a, int8x4_t __b) {
+  return __builtin_arm_uxtab16(__a, __b);
+}
+static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
+__uxtb16(int8x4_t __a) {
+  return __builtin_arm_uxtb16(__a);
+}
+#endif
+
+/* 9.5.6 Parallel selection */
+#if __ARM_FEATURE_SIMD32
+static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
+__sel(uint8x4_t __a, uint8x4_t __b) {
+  return __builtin_arm_sel(__a, __b);
+}
+#endif
+
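An illustrative sketch of how these compose (my example; the function name and Q15 scaling are assumptions, not from the patch): __smlabb gives a 16x16+32 multiply-accumulate, and __ssat clamps the result back into a narrower range:

    #include <arm_acle.h>
    #include <stdint.h>

    /* One Q15 multiply-accumulate step with a saturated 16-bit result. */
    int16_t q15_mac(int32_t acc, int32_t x, int32_t coeff) {
      acc = __smlabb(x, coeff, acc);          /* acc += lo16(x) * lo16(coeff) */
      return (int16_t)__ssat(acc >> 15, 16);  /* rescale, saturate to 16 bits */
    }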
+/* 9.5.7 Parallel 8-bit addition and subtraction */
+#if __ARM_FEATURE_SIMD32
+static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__))
+__qadd8(int8x4_t __a, int8x4_t __b) {
+  return __builtin_arm_qadd8(__a, __b);
+}
+static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__))
+__qsub8(int8x4_t __a, int8x4_t __b) {
+  return __builtin_arm_qsub8(__a, __b);
+}
+static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__))
+__sadd8(int8x4_t __a, int8x4_t __b) {
+  return __builtin_arm_sadd8(__a, __b);
+}
+static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__))
+__shadd8(int8x4_t __a, int8x4_t __b) {
+  return __builtin_arm_shadd8(__a, __b);
+}
+static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__))
+__shsub8(int8x4_t __a, int8x4_t __b) {
+  return __builtin_arm_shsub8(__a, __b);
+}
+static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__))
+__ssub8(int8x4_t __a, int8x4_t __b) {
+  return __builtin_arm_ssub8(__a, __b);
+}
+static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
+__uadd8(uint8x4_t __a, uint8x4_t __b) {
+  return __builtin_arm_uadd8(__a, __b);
+}
+static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
+__uhadd8(uint8x4_t __a, uint8x4_t __b) {
+  return __builtin_arm_uhadd8(__a, __b);
+}
+static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
+__uhsub8(uint8x4_t __a, uint8x4_t __b) {
+  return __builtin_arm_uhsub8(__a, __b);
+}
+static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
+__uqadd8(uint8x4_t __a, uint8x4_t __b) {
+  return __builtin_arm_uqadd8(__a, __b);
+}
+static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
+__uqsub8(uint8x4_t __a, uint8x4_t __b) {
+  return __builtin_arm_uqsub8(__a, __b);
+}
+static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
+__usub8(uint8x4_t __a, uint8x4_t __b) {
+  return __builtin_arm_usub8(__a, __b);
+}
+#endif
+
+/* 9.5.8 Sum of 8-bit absolute differences */
+#if __ARM_FEATURE_SIMD32
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+__usad8(uint8x4_t __a, uint8x4_t __b) {
+  return __builtin_arm_usad8(__a, __b);
+}
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+__usada8(uint8x4_t __a, uint8x4_t __b, uint32_t __c) {
+  return __builtin_arm_usada8(__a, __b, __c);
+}
+#endif
+
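A sketch of the classic use of these (my example; the helper name and alignment handling are illustrative): __usada8 accumulates the sum of absolute differences of four packed bytes per call, the inner step of motion-estimation SAD loops:

    #include <arm_acle.h>
    #include <stdint.h>

    /* SAD over two pixel rows; assumes n is a multiple of 4. */
    uint32_t row_sad(const uint8_t *p, const uint8_t *q, int n) {
      uint32_t acc = 0;
      for (int i = 0; i < n; i += 4) {
        uint8x4_t a, b;
        __builtin_memcpy(&a, p + i, 4);  /* load 4 pixels as one packed word */
        __builtin_memcpy(&b, q + i, 4);
        acc = __usada8(a, b, acc);       /* acc += |a0-b0| + ... + |a3-b3| */
      }
      return acc;
    }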
+/* 9.5.9 Parallel 16-bit addition and subtraction */
+#if __ARM_FEATURE_SIMD32
+static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
+__qadd16(int16x2_t __a, int16x2_t __b) {
+  return __builtin_arm_qadd16(__a, __b);
+}
+static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
+__qasx(int16x2_t __a, int16x2_t __b) {
+  return __builtin_arm_qasx(__a, __b);
+}
+static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
+__qsax(int16x2_t __a, int16x2_t __b) {
+  return __builtin_arm_qsax(__a, __b);
+}
+static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
+__qsub16(int16x2_t __a, int16x2_t __b) {
+  return __builtin_arm_qsub16(__a, __b);
+}
+static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
+__sadd16(int16x2_t __a, int16x2_t __b) {
+  return __builtin_arm_sadd16(__a, __b);
+}
+static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
+__sasx(int16x2_t __a, int16x2_t __b) {
+  return __builtin_arm_sasx(__a, __b);
+}
+static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
+__shadd16(int16x2_t __a, int16x2_t __b) {
+  return __builtin_arm_shadd16(__a, __b);
+}
+static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
+__shasx(int16x2_t __a, int16x2_t __b) {
+  return __builtin_arm_shasx(__a, __b);
+}
+static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
+__shsax(int16x2_t __a, int16x2_t __b) {
+  return __builtin_arm_shsax(__a, __b);
+}
+static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
+__shsub16(int16x2_t __a, int16x2_t __b) {
+  return __builtin_arm_shsub16(__a, __b);
+}
+static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
+__ssax(int16x2_t __a, int16x2_t __b) {
+  return __builtin_arm_ssax(__a, __b);
+}
+static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
+__ssub16(int16x2_t __a, int16x2_t __b) {
+  return __builtin_arm_ssub16(__a, __b);
+}
+static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
+__uadd16(uint16x2_t __a, uint16x2_t __b) {
+  return __builtin_arm_uadd16(__a, __b);
+}
+static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
+__uasx(uint16x2_t __a, uint16x2_t __b) {
+  return __builtin_arm_uasx(__a, __b);
+}
+static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
+__uhadd16(uint16x2_t __a, uint16x2_t __b) {
+  return __builtin_arm_uhadd16(__a, __b);
+}
+static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
+__uhasx(uint16x2_t __a, uint16x2_t __b) {
+  return __builtin_arm_uhasx(__a, __b);
+}
+static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
+__uhsax(uint16x2_t __a, uint16x2_t __b) {
+  return __builtin_arm_uhsax(__a, __b);
+}
+static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
+__uhsub16(uint16x2_t __a, uint16x2_t __b) {
+  return __builtin_arm_uhsub16(__a, __b);
+}
+static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
+__uqadd16(uint16x2_t __a, uint16x2_t __b) {
+  return __builtin_arm_uqadd16(__a, __b);
+}
+static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
+__uqasx(uint16x2_t __a, uint16x2_t __b) {
+  return __builtin_arm_uqasx(__a, __b);
+}
+static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
+__uqsax(uint16x2_t __a, uint16x2_t __b) {
+  return __builtin_arm_uqsax(__a, __b);
+}
+static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
+__uqsub16(uint16x2_t __a, uint16x2_t __b) {
+  return __builtin_arm_uqsub16(__a, __b);
+}
+static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
+__usax(uint16x2_t __a, uint16x2_t __b) {
+  return __builtin_arm_usax(__a, __b);
+}
+static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
+__usub16(uint16x2_t __a, uint16x2_t __b) {
+  return __builtin_arm_usub16(__a, __b);
+}
+#endif
+
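A usage sketch (mine): with two int16_t lanes per 32-bit register, __qadd16 performs two independent saturating additions in one instruction, e.g. mixing a stereo frame stored with the left sample in the low half and the right sample in the high half (the lane layout is an assumption of this example):

    #include <arm_acle.h>

    /* Mix two stereo frames with per-lane signed saturation, avoiding the
     * wraparound clicks a plain 32-bit add of packed samples would produce. */
    int16x2_t mix_frames(int16x2_t a, int16x2_t b) {
      return __qadd16(a, b);  /* QADD16: two saturating 16-bit adds */
    }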
+/* 9.5.10 Parallel 16-bit multiplications */
+#if __ARM_FEATURE_SIMD32
+static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
+__smlad(int16x2_t __a, int16x2_t __b, int32_t __c) {
+  return __builtin_arm_smlad(__a, __b, __c);
+}
+static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
+__smladx(int16x2_t __a, int16x2_t __b, int32_t __c) {
+  return __builtin_arm_smladx(__a, __b, __c);
+}
+static __inline__ int64_t __attribute__((__always_inline__, __nodebug__))
+__smlald(int16x2_t __a, int16x2_t __b, int64_t __c) {
+  return __builtin_arm_smlald(__a, __b, __c);
+}
+static __inline__ int64_t __attribute__((__always_inline__, __nodebug__))
+__smlaldx(int16x2_t __a, int16x2_t __b, int64_t __c) {
+  return __builtin_arm_smlaldx(__a, __b, __c);
+}
+static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
+__smlsd(int16x2_t __a, int16x2_t __b, int32_t __c) {
+  return __builtin_arm_smlsd(__a, __b, __c);
+}
+static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
+__smlsdx(int16x2_t __a, int16x2_t __b, int32_t __c) {
+  return __builtin_arm_smlsdx(__a, __b, __c);
+}
+static __inline__ int64_t __attribute__((__always_inline__, __nodebug__))
+__smlsld(int16x2_t __a, int16x2_t __b, int64_t __c) {
+  return __builtin_arm_smlsld(__a, __b, __c);
+}
+static __inline__ int64_t __attribute__((__always_inline__, __nodebug__))
+__smlsldx(int16x2_t __a, int16x2_t __b, int64_t __c) {
+  return __builtin_arm_smlsldx(__a, __b, __c);
+}
+static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
+__smuad(int16x2_t __a, int16x2_t __b) {
+  return __builtin_arm_smuad(__a, __b);
+}
+static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
+__smuadx(int16x2_t __a, int16x2_t __b) {
+  return __builtin_arm_smuadx(__a, __b);
+}
+static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
+__smusd(int16x2_t __a, int16x2_t __b) {
+  return __builtin_arm_smusd(__a, __b);
+}
+static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
+__smusdx(int16x2_t __a, int16x2_t __b) {
+  return __builtin_arm_smusdx(__a, __b);
+}
+#endif
+
 /* 9.7 CRC32 intrinsics */
 #if __ARM_FEATURE_CRC32
 static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
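A sketch of the intended use (my example; the helper name and the evenness/alignment assumptions are illustrative): __smlald multiplies both 16-bit lane pairs and adds both products into a 64-bit accumulator, which is the inner step of a Q15 dot product or FIR filter:

    #include <arm_acle.h>
    #include <stdint.h>

    /* Q15 dot product, two samples per iteration; assumes n is even. */
    int64_t dot_q15(const int16_t *x, const int16_t *y, int n) {
      int64_t acc = 0;
      for (int i = 0; i < n; i += 2) {
        int16x2_t xv, yv;
        __builtin_memcpy(&xv, x + i, 4);  /* load a pair of samples */
        __builtin_memcpy(&yv, y + i, 4);
        acc = __smlald(xv, yv, acc);      /* acc += x[i]*y[i] + x[i+1]*y[i+1] */
      }
      return acc;
    }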
diff --git a/clang/test/CodeGen/arm_acle.c b/clang/test/CodeGen/arm_acle.c
index b4f39be..7c8a5aa 100644
--- a/clang/test/CodeGen/arm_acle.c
+++ b/clang/test/CodeGen/arm_acle.c
@@ -76,7 +76,7 @@ void test_dbg(void) {
 // AArch32: call i32 @llvm.arm.strex
 // AArch64: call i64 @llvm.aarch64.ldxr
 // AArch64: call i32 @llvm.aarch64.stxr
-uint32_t test_swp(uint32_t x, volatile void *p) {
+void test_swp(uint32_t x, volatile void *p) {
   __swp(x, p);
 }
 
@@ -118,6 +118,7 @@ void test_nop(void) {
 }
 
 /* 9 DATA-PROCESSING INTRINSICS */
+
 /* 9.2 Miscellaneous data-processing intrinsics */
 // ARM-LABEL: test_ror
 // ARM: lshr
@@ -266,8 +267,7 @@ uint64_t test_rbitll(uint64_t t) {
 }
 
 /* 9.4 Saturating intrinsics */
-#ifdef __ARM_32BIT_STATE
-
+#ifdef __ARM_FEATURE_SAT
 /* 9.4.1 Width-specified saturation intrinsics */
 // AArch32-LABEL: test_ssat
 // AArch32: call i32 @llvm.arm.ssat(i32 %t, i32 1)
@@ -277,11 +277,13 @@ int32_t test_ssat(int32_t t) {
 
 // AArch32-LABEL: test_usat
 // AArch32: call i32 @llvm.arm.usat(i32 %t, i32 2)
-int32_t test_usat(int32_t t) {
+uint32_t test_usat(int32_t t) {
   return __usat(t, 2);
 }
+#endif
 
 /* 9.4.2 Saturating addition and subtraction intrinsics */
+#ifdef __ARM_FEATURE_DSP
 // AArch32-LABEL: test_qadd
 // AArch32: call i32 @llvm.arm.qadd(i32 %a, i32 %b)
 int32_t test_qadd(int32_t a, int32_t b) {
@@ -304,6 +306,389 @@ int32_t test_qdbl() {
 }
 #endif
 
+/*
+ * 9.3 16-bit multiplications
+ */
+#if __ARM_FEATURE_DSP
+// AArch32-LABEL: test_smulbb
+// AArch32: call i32 @llvm.arm.smulbb
+int32_t test_smulbb(int32_t a, int32_t b) {
+  return __smulbb(a, b);
+}
+// AArch32-LABEL: test_smulbt
+// AArch32: call i32 @llvm.arm.smulbt
+int32_t test_smulbt(int32_t a, int32_t b) {
+  return __smulbt(a, b);
+}
+// AArch32-LABEL: test_smultb
+// AArch32: call i32 @llvm.arm.smultb
+int32_t test_smultb(int32_t a, int32_t b) {
+  return __smultb(a, b);
+}
+// AArch32-LABEL: test_smultt
+// AArch32: call i32 @llvm.arm.smultt
+int32_t test_smultt(int32_t a, int32_t b) {
+  return __smultt(a, b);
+}
+// AArch32-LABEL: test_smulwb
+// AArch32: call i32 @llvm.arm.smulwb
+int32_t test_smulwb(int32_t a, int32_t b) {
+  return __smulwb(a, b);
+}
+// AArch32-LABEL: test_smulwt
+// AArch32: call i32 @llvm.arm.smulwt
+int32_t test_smulwt(int32_t a, int32_t b) {
+  return __smulwt(a, b);
+}
+#endif
+
+/* 9.4.3 Accumulating multiplications */
+#if __ARM_FEATURE_DSP
+// AArch32-LABEL: test_smlabb
+// AArch32: call i32 @llvm.arm.smlabb(i32 %a, i32 %b, i32 %c)
+int32_t test_smlabb(int32_t a, int32_t b, int32_t c) {
+  return __smlabb(a, b, c);
+}
+// AArch32-LABEL: test_smlabt
+// AArch32: call i32 @llvm.arm.smlabt(i32 %a, i32 %b, i32 %c)
+int32_t test_smlabt(int32_t a, int32_t b, int32_t c) {
+  return __smlabt(a, b, c);
+}
+// AArch32-LABEL: test_smlatb
+// AArch32: call i32 @llvm.arm.smlatb(i32 %a, i32 %b, i32 %c)
+int32_t test_smlatb(int32_t a, int32_t b, int32_t c) {
+  return __smlatb(a, b, c);
+}
+// AArch32-LABEL: test_smlatt
+// AArch32: call i32 @llvm.arm.smlatt(i32 %a, i32 %b, i32 %c)
+int32_t test_smlatt(int32_t a, int32_t b, int32_t c) {
+  return __smlatt(a, b, c);
+}
+// AArch32-LABEL: test_smlawb
+// AArch32: call i32 @llvm.arm.smlawb(i32 %a, i32 %b, i32 %c)
+int32_t test_smlawb(int32_t a, int32_t b, int32_t c) {
+  return __smlawb(a, b, c);
+}
+// AArch32-LABEL: test_smlawt
+// AArch32: call i32 @llvm.arm.smlawt(i32 %a, i32 %b, i32 %c)
+int32_t test_smlawt(int32_t a, int32_t b, int32_t c) {
+  return __smlawt(a, b, c);
+}
+#endif
+
+/* 9.5.4 Parallel 16-bit saturation */
+#if __ARM_FEATURE_SIMD32
+// AArch32-LABEL: test_ssat16
+// AArch32: call i32 @llvm.arm.ssat16
+int16x2_t test_ssat16(int16x2_t a) {
+  return __ssat16(a, 15);
+}
+// AArch32-LABEL: test_usat16
+// AArch32: call i32 @llvm.arm.usat16
+uint16x2_t test_usat16(int16x2_t a) {
+  return __usat16(a, 15);
+}
+#endif
+
+/* 9.5.5 Packing and unpacking */
+#if __ARM_FEATURE_SIMD32
+// AArch32-LABEL: test_sxtab16
+// AArch32: call i32 @llvm.arm.sxtab16
+int16x2_t test_sxtab16(int16x2_t a, int8x4_t b) {
+  return __sxtab16(a, b);
+}
+// AArch32-LABEL: test_sxtb16
+// AArch32: call i32 @llvm.arm.sxtb16
+int16x2_t test_sxtb16(int8x4_t a) {
+  return __sxtb16(a);
+}
+// AArch32-LABEL: test_uxtab16
+// AArch32: call i32 @llvm.arm.uxtab16
+int16x2_t test_uxtab16(int16x2_t a, int8x4_t b) {
+  return __uxtab16(a, b);
+}
+// AArch32-LABEL: test_uxtb16
+// AArch32: call i32 @llvm.arm.uxtb16
+int16x2_t test_uxtb16(int8x4_t a) {
+  return __uxtb16(a);
+}
+#endif
+
+/* 9.5.6 Parallel selection */
+#if __ARM_FEATURE_SIMD32
+// AArch32-LABEL: test_sel
+// AArch32: call i32 @llvm.arm.sel
+uint8x4_t test_sel(uint8x4_t a, uint8x4_t b) {
+  return __sel(a, b);
+}
+#endif
+
+/* 9.5.7 Parallel 8-bit addition and subtraction */
+#if __ARM_FEATURE_SIMD32
+// AArch32-LABEL: test_qadd8
+// AArch32: call i32 @llvm.arm.qadd8
+int8x4_t test_qadd8(int8x4_t a, int8x4_t b) {
+  return __qadd8(a, b);
+}
+// AArch32-LABEL: test_qsub8
+// AArch32: call i32 @llvm.arm.qsub8
+int8x4_t test_qsub8(int8x4_t a, int8x4_t b) {
+  return __qsub8(a, b);
+}
+// AArch32-LABEL: test_sadd8
+// AArch32: call i32 @llvm.arm.sadd8
+int8x4_t test_sadd8(int8x4_t a, int8x4_t b) {
+  return __sadd8(a, b);
+}
+// AArch32-LABEL: test_shadd8
+// AArch32: call i32 @llvm.arm.shadd8
+int8x4_t test_shadd8(int8x4_t a, int8x4_t b) {
+  return __shadd8(a, b);
+}
+// AArch32-LABEL: test_shsub8
+// AArch32: call i32 @llvm.arm.shsub8
+int8x4_t test_shsub8(int8x4_t a, int8x4_t b) {
+  return __shsub8(a, b);
+}
+// AArch32-LABEL: test_ssub8
+// AArch32: call i32 @llvm.arm.ssub8
+int8x4_t test_ssub8(int8x4_t a, int8x4_t b) {
+  return __ssub8(a, b);
+}
+// AArch32-LABEL: test_uadd8
+// AArch32: call i32 @llvm.arm.uadd8
+uint8x4_t test_uadd8(uint8x4_t a, uint8x4_t b) {
+  return __uadd8(a, b);
+}
+// AArch32-LABEL: test_uhadd8
+// AArch32: call i32 @llvm.arm.uhadd8
+uint8x4_t test_uhadd8(uint8x4_t a, uint8x4_t b) {
+  return __uhadd8(a, b);
+}
+// AArch32-LABEL: test_uhsub8
+// AArch32: call i32 @llvm.arm.uhsub8
+uint8x4_t test_uhsub8(uint8x4_t a, uint8x4_t b) {
+  return __uhsub8(a, b);
+}
+// AArch32-LABEL: test_uqadd8
+// AArch32: call i32 @llvm.arm.uqadd8
+uint8x4_t test_uqadd8(uint8x4_t a, uint8x4_t b) {
+  return __uqadd8(a, b);
+}
+// AArch32-LABEL: test_uqsub8
+// AArch32: call i32 @llvm.arm.uqsub8
+uint8x4_t test_uqsub8(uint8x4_t a, uint8x4_t b) {
+  return __uqsub8(a, b);
+}
+// AArch32-LABEL: test_usub8
+// AArch32: call i32 @llvm.arm.usub8
+uint8x4_t test_usub8(uint8x4_t a, uint8x4_t b) {
+  return __usub8(a, b);
+}
+#endif
+
+/* 9.5.8 Sum of 8-bit absolute differences */
+#if __ARM_FEATURE_SIMD32
+// AArch32-LABEL: test_usad8
+// AArch32: call i32 @llvm.arm.usad8
+uint32_t test_usad8(uint8x4_t a, uint8x4_t b) {
+  return __usad8(a, b);
+}
+// AArch32-LABEL: test_usada8
+// AArch32: call i32 @llvm.arm.usada8
+uint32_t test_usada8(uint8x4_t a, uint8x4_t b, uint32_t c) {
+  return __usada8(a, b, c);
+}
+#endif
+
+/* 9.5.9 Parallel 16-bit addition and subtraction */
+#if __ARM_FEATURE_SIMD32
+// AArch32-LABEL: test_qadd16
+// AArch32: call i32 @llvm.arm.qadd16
+int16x2_t test_qadd16(int16x2_t a, int16x2_t b) {
+  return __qadd16(a, b);
+}
+// AArch32-LABEL: test_qasx
+// AArch32: call i32 @llvm.arm.qasx
+int16x2_t test_qasx(int16x2_t a, int16x2_t b) {
+  return __qasx(a, b);
+}
+// AArch32-LABEL: test_qsax
+// AArch32: call i32 @llvm.arm.qsax
+int16x2_t test_qsax(int16x2_t a, int16x2_t b) {
+  return __qsax(a, b);
+}
+// AArch32-LABEL: test_qsub16
+// AArch32: call i32 @llvm.arm.qsub16
+int16x2_t test_qsub16(int16x2_t a, int16x2_t b) {
+  return __qsub16(a, b);
+}
+// AArch32-LABEL: test_sadd16
+// AArch32: call i32 @llvm.arm.sadd16
+int16x2_t test_sadd16(int16x2_t a, int16x2_t b) {
+  return __sadd16(a, b);
+}
+// AArch32-LABEL: test_sasx
+// AArch32: call i32 @llvm.arm.sasx
+int16x2_t test_sasx(int16x2_t a, int16x2_t b) {
+  return __sasx(a, b);
+}
+// AArch32-LABEL: test_shadd16
+// AArch32: call i32 @llvm.arm.shadd16
+int16x2_t test_shadd16(int16x2_t a, int16x2_t b) {
+  return __shadd16(a, b);
+}
+// AArch32-LABEL: test_shasx
+// AArch32: call i32 @llvm.arm.shasx
+int16x2_t test_shasx(int16x2_t a, int16x2_t b) {
+  return __shasx(a, b);
+}
+// AArch32-LABEL: test_shsax
+// AArch32: call i32 @llvm.arm.shsax
+int16x2_t test_shsax(int16x2_t a, int16x2_t b) {
+  return __shsax(a, b);
+}
+// AArch32-LABEL: test_shsub16
+// AArch32: call i32 @llvm.arm.shsub16
+int16x2_t test_shsub16(int16x2_t a, int16x2_t b) {
+  return __shsub16(a, b);
+}
+// AArch32-LABEL: test_ssax
+// AArch32: call i32 @llvm.arm.ssax
+int16x2_t test_ssax(int16x2_t a, int16x2_t b) {
+  return __ssax(a, b);
+}
+// AArch32-LABEL: test_ssub16
+// AArch32: call i32 @llvm.arm.ssub16
+int16x2_t test_ssub16(int16x2_t a, int16x2_t b) {
+  return __ssub16(a, b);
+}
+// AArch32-LABEL: test_uadd16
+// AArch32: call i32 @llvm.arm.uadd16
+uint16x2_t test_uadd16(uint16x2_t a, uint16x2_t b) {
+  return __uadd16(a, b);
+}
+// AArch32-LABEL: test_uasx
+// AArch32: call i32 @llvm.arm.uasx
+uint16x2_t test_uasx(uint16x2_t a, uint16x2_t b) {
+  return __uasx(a, b);
+}
+// AArch32-LABEL: test_uhadd16
+// AArch32: call i32 @llvm.arm.uhadd16
+uint16x2_t test_uhadd16(uint16x2_t a, uint16x2_t b) {
+  return __uhadd16(a, b);
+}
+// AArch32-LABEL: test_uhasx
+// AArch32: call i32 @llvm.arm.uhasx
+uint16x2_t test_uhasx(uint16x2_t a, uint16x2_t b) {
+  return __uhasx(a, b);
+}
+// AArch32-LABEL: test_uhsax
+// AArch32: call i32 @llvm.arm.uhsax
+uint16x2_t test_uhsax(uint16x2_t a, uint16x2_t b) {
+  return __uhsax(a, b);
+}
+// AArch32-LABEL: test_uhsub16
+// AArch32: call i32 @llvm.arm.uhsub16
+uint16x2_t test_uhsub16(uint16x2_t a, uint16x2_t b) {
+  return __uhsub16(a, b);
+}
+// AArch32-LABEL: test_uqadd16
+// AArch32: call i32 @llvm.arm.uqadd16
+uint16x2_t test_uqadd16(uint16x2_t a, uint16x2_t b) {
+  return __uqadd16(a, b);
+}
+// AArch32-LABEL: test_uqasx
+// AArch32: call i32 @llvm.arm.uqasx
+uint16x2_t test_uqasx(uint16x2_t a, uint16x2_t b) {
+  return __uqasx(a, b);
+}
+// AArch32-LABEL: test_uqsax
+// AArch32: call i32 @llvm.arm.uqsax
+uint16x2_t test_uqsax(uint16x2_t a, uint16x2_t b) {
+  return __uqsax(a, b);
+}
+// AArch32-LABEL: test_uqsub16
+// AArch32: call i32 @llvm.arm.uqsub16
+uint16x2_t test_uqsub16(uint16x2_t a, uint16x2_t b) {
+  return __uqsub16(a, b);
+}
+// AArch32-LABEL: test_usax
+// AArch32: call i32 @llvm.arm.usax
+uint16x2_t test_usax(uint16x2_t a, uint16x2_t b) {
+  return __usax(a, b);
+}
+// AArch32-LABEL: test_usub16
+// AArch32: call i32 @llvm.arm.usub16
+uint16x2_t test_usub16(uint16x2_t a, uint16x2_t b) {
+  return __usub16(a, b);
+}
+#endif
+
+/* 9.5.10 Parallel 16-bit multiplications */
+#if __ARM_FEATURE_SIMD32
+// AArch32-LABEL: test_smlad
+// AArch32: call i32 @llvm.arm.smlad
+int32_t test_smlad(int16x2_t a, int16x2_t b, int32_t c) {
+  return __smlad(a, b, c);
+}
+// AArch32-LABEL: test_smladx
+// AArch32: call i32 @llvm.arm.smladx
+int32_t test_smladx(int16x2_t a, int16x2_t b, int32_t c) {
+  return __smladx(a, b, c);
+}
+// AArch32-LABEL: test_smlald
+// AArch32: call i64 @llvm.arm.smlald
+int64_t test_smlald(int16x2_t a, int16x2_t b, int64_t c) {
+  return __smlald(a, b, c);
+}
+// AArch32-LABEL: test_smlaldx
+// AArch32: call i64 @llvm.arm.smlaldx
+int64_t test_smlaldx(int16x2_t a, int16x2_t b, int64_t c) {
+  return __smlaldx(a, b, c);
+}
+// AArch32-LABEL: test_smlsd
+// AArch32: call i32 @llvm.arm.smlsd
+int32_t test_smlsd(int16x2_t a, int16x2_t b, int32_t c) {
+  return __smlsd(a, b, c);
+}
+// AArch32-LABEL: test_smlsdx
+// AArch32: call i32 @llvm.arm.smlsdx
+int32_t test_smlsdx(int16x2_t a, int16x2_t b, int32_t c) {
+  return __smlsdx(a, b, c);
+}
+// AArch32-LABEL: test_smlsld
+// AArch32: call i64 @llvm.arm.smlsld
+int64_t test_smlsld(int16x2_t a, int16x2_t b, int64_t c) {
+  return __smlsld(a, b, c);
+}
+// AArch32-LABEL: test_smlsldx
+// AArch32: call i64 @llvm.arm.smlsldx
+int64_t test_smlsldx(int16x2_t a, int16x2_t b, int64_t c) {
+  return __smlsldx(a, b, c);
+}
+// AArch32-LABEL: test_smuad
+// AArch32: call i32 @llvm.arm.smuad
+int32_t test_smuad(int16x2_t a, int16x2_t b) {
+  return __smuad(a, b);
+}
+// AArch32-LABEL: test_smuadx
+// AArch32: call i32 @llvm.arm.smuadx
+int32_t test_smuadx(int16x2_t a, int16x2_t b) {
+  return __smuadx(a, b);
+}
+// AArch32-LABEL: test_smusd
+// AArch32: call i32 @llvm.arm.smusd
+int32_t test_smusd(int16x2_t a, int16x2_t b) {
+  return __smusd(a, b);
+}
+// AArch32-LABEL: test_smusdx
+// AArch32: call i32 @llvm.arm.smusdx
+int32_t test_smusdx(int16x2_t a, int16x2_t b) {
+  return __smusdx(a, b);
+}
+#endif
+
 /* 9.7 CRC32 intrinsics */
 // ARM-LABEL: test_crc32b
 // AArch32: call i32 @llvm.arm.crc32b
-- 
2.7.4
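To try the new intrinsics end to end, a minimal sketch (the file name, target triple, and values are mine; any 32-bit ARM target with the DSP and SIMD32 extensions should work, e.g. clang --target=arm-none-eabi -march=armv7-a -c demo.c):

    /* demo.c: illustrative only. */
    #include <arm_acle.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
    #if __ARM_FEATURE_SIMD32
      int16x2_t a = (100 & 0xffff) | (200 << 16);  /* pack lanes {100, 200} */
      int16x2_t b = (300 & 0xffff) | (400 << 16);  /* pack lanes {300, 400} */
      /* SMUAD: 100*300 + 200*400 = 110000 */
      printf("%d\n", (int)__smuad(a, b));
    #endif
      return 0;
    }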