From: Paul A. Clarke
Date: Tue, 9 Oct 2018 20:25:57 +0000 (+0000)
Subject: This is a follow-on to earlier commits for adding compatibility
 implementations of x86 intrinsics for PPC64LE
X-Git-Tag: upstream/12.2.0~28806
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=1fb0f8924f59decc2198de591d7fe80319419814;p=platform%2Fupstream%2Fgcc.git

This is a follow-on to earlier commits for adding compatibility
implementations of x86 intrinsics for PPC64LE.

This is the first of two patches.  This patch adds 11 of the 13 x86
intrinsics from <pmmintrin.h> ("SSE3").  (Patch 2/2 adds tests for
these intrinsics, and briefly describes the tests performed.)

Implementations are relatively straightforward, with occasional extra
effort for vector element ordering.

Not implemented are _mm_mwait and _mm_monitor, as there are no direct
or trivial analogs in the POWER ISA.

./gcc/ChangeLog:

2018-10-05  Paul A. Clarke

	* config.gcc (powerpc*-*-*): Add pmmintrin.h to extra_headers.
	* config/rs6000/pmmintrin.h: New file.

From-SVN: r264991
---
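As a quick illustration of the addsub semantics mentioned above, the scalar
sketch below models what _mm_addsub_ps computes, and why flipping the sign of
only the even-numbered elements of the second operand (xor with -0.0) reduces
the whole operation to a single vector add.  The harness (values, main) is
illustrative only and not part of the patch:

/* Scalar model of _mm_addsub_ps: subtract in even lanes, add in odd
   lanes.  Negating the even lanes of y first makes every lane an add,
   which is what the vec_xor/vec_add pair in pmmintrin.h does.  */
#include <stdio.h>

int main (void)
{
  float x[4] = { 1.0f, 2.0f, 3.0f, 4.0f };
  float y[4] = { 10.0f, 20.0f, 30.0f, 40.0f };
  float r[4], r2[4];

  for (int i = 0; i < 4; i++)
    {
      r[i] = (i % 2 == 0) ? x[i] - y[i] : x[i] + y[i];  /* addsub */
      r2[i] = x[i] + ((i % 2 == 0) ? -y[i] : y[i]);     /* sign flip, then add */
    }

  printf ("%g %g %g %g\n", r[0], r[1], r[2], r[3]);     /* -9 22 -27 44 */
  printf ("%g %g %g %g\n", r2[0], r2[1], r2[2], r2[3]); /* same result */
  return 0;
}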
diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index b4dc11f..6dfc2ef 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,8 @@
+2018-10-09  Paul A. Clarke
+
+	* config.gcc (powerpc*-*-*): Add pmmintrin.h to extra_headers.
+	* config/rs6000/pmmintrin.h: New file.
+
 2018-10-09  Eric Botcazou
 
 	PR tree-optimization/86659
diff --git a/gcc/config.gcc b/gcc/config.gcc
index 0c579d1..8521f7d 100644
--- a/gcc/config.gcc
+++ b/gcc/config.gcc
@@ -484,6 +484,7 @@ powerpc*-*-*)
 	extra_headers="${extra_headers} bmi2intrin.h bmiintrin.h"
 	extra_headers="${extra_headers} xmmintrin.h mm_malloc.h emmintrin.h"
 	extra_headers="${extra_headers} mmintrin.h x86intrin.h"
+	extra_headers="${extra_headers} pmmintrin.h"
 	extra_headers="${extra_headers} ppu_intrinsics.h spu2vmx.h vec_types.h si2vmx.h"
 	extra_headers="${extra_headers} amo.h"
 	case x$with_cpu in
diff --git a/gcc/config/rs6000/pmmintrin.h b/gcc/config/rs6000/pmmintrin.h
new file mode 100644
index 0000000..8cdd056
--- /dev/null
+++ b/gcc/config/rs6000/pmmintrin.h
@@ -0,0 +1,162 @@
+/* Copyright (C) 2003-2018 Free Software Foundation, Inc.
+
+   This file is part of GCC.
+
+   GCC is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 3, or (at your option)
+   any later version.
+
+   GCC is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+/* Implemented from the specification included in the Intel C++ Compiler
+   User Guide and Reference, version 9.0.  */
+
+#ifndef NO_WARN_X86_INTRINSICS
+/* This header is distributed to simplify porting x86_64 code that
+   makes explicit use of Intel intrinsics to powerpc64le.
+   It is the user's responsibility to determine if the results are
+   acceptable and make additional changes as necessary.
+
+   Note that much code that uses Intel intrinsics can be rewritten in
+   standard C or GNU C extensions, which are more portable and better
+   optimized across multiple targets.
+
+   In the specific case of the x86 SSE3 intrinsics, the PowerPC VMX/VSX
+   ISA is a good match for most SIMD operations.  However, the
+   horizontal add/sub operations require the data pairs to be permuted
+   into separate registers with vertical even/odd alignment, and the
+   addsub operations require the sign of only the even-numbered
+   elements to be flipped (xored with -0.0).
+   For larger blocks of code using these intrinsic implementations,
+   the compiler should be able to schedule instructions to avoid
+   additional latency.
+
+   In the specific case of the monitor and mwait instructions, there
+   are no direct equivalents in the PowerISA at this time, so those
+   intrinsics are not implemented.  */
+#error "Please read comment above.  Use -DNO_WARN_X86_INTRINSICS to disable this warning."
+#endif
+
+#ifndef _PMMINTRIN_H_INCLUDED
+#define _PMMINTRIN_H_INCLUDED
+
+/* We need definitions from the SSE2 and SSE header files.  */
+#include <emmintrin.h>
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_addsub_ps (__m128 __X, __m128 __Y)
+{
+  const __v4sf even_n0 = {-0.0, 0.0, -0.0, 0.0};
+  __v4sf even_neg_Y = vec_xor (__Y, even_n0);
+  return (__m128) vec_add (__X, even_neg_Y);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_addsub_pd (__m128d __X, __m128d __Y)
+{
+  const __v2df even_n0 = {-0.0, 0.0};
+  __v2df even_neg_Y = vec_xor (__Y, even_n0);
+  return (__m128d) vec_add (__X, even_neg_Y);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_hadd_ps (__m128 __X, __m128 __Y)
+{
+  __vector unsigned char xform2 = {
+#ifdef __LITTLE_ENDIAN__
+      0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0A, 0x0B,
+      0x10, 0x11, 0x12, 0x13, 0x18, 0x19, 0x1A, 0x1B
+#elif __BIG_ENDIAN__
+      0x14, 0x15, 0x16, 0x17, 0x1C, 0x1D, 0x1E, 0x1F,
+      0x04, 0x05, 0x06, 0x07, 0x0C, 0x0D, 0x0E, 0x0F
+#endif
+    };
+  __vector unsigned char xform1 = {
+#ifdef __LITTLE_ENDIAN__
+      0x04, 0x05, 0x06, 0x07, 0x0C, 0x0D, 0x0E, 0x0F,
+      0x14, 0x15, 0x16, 0x17, 0x1C, 0x1D, 0x1E, 0x1F
+#elif __BIG_ENDIAN__
+      0x10, 0x11, 0x12, 0x13, 0x18, 0x19, 0x1A, 0x1B,
+      0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0A, 0x0B
+#endif
+    };
+  return (__m128) vec_add (vec_perm ((__v4sf) __X, (__v4sf) __Y, xform2),
+			   vec_perm ((__v4sf) __X, (__v4sf) __Y, xform1));
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_hsub_ps (__m128 __X, __m128 __Y)
+{
+  __vector unsigned char xform2 = {
+#ifdef __LITTLE_ENDIAN__
+      0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0A, 0x0B,
+      0x10, 0x11, 0x12, 0x13, 0x18, 0x19, 0x1A, 0x1B
+#elif __BIG_ENDIAN__
+      0x14, 0x15, 0x16, 0x17, 0x1C, 0x1D, 0x1E, 0x1F,
+      0x04, 0x05, 0x06, 0x07, 0x0C, 0x0D, 0x0E, 0x0F
+#endif
+    };
+  __vector unsigned char xform1 = {
+#ifdef __LITTLE_ENDIAN__
+      0x04, 0x05, 0x06, 0x07, 0x0C, 0x0D, 0x0E, 0x0F,
+      0x14, 0x15, 0x16, 0x17, 0x1C, 0x1D, 0x1E, 0x1F
+#elif __BIG_ENDIAN__
+      0x10, 0x11, 0x12, 0x13, 0x18, 0x19, 0x1A, 0x1B,
+      0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0A, 0x0B
+#endif
+    };
+  return (__m128) vec_sub (vec_perm ((__v4sf) __X, (__v4sf) __Y, xform2),
+			   vec_perm ((__v4sf) __X, (__v4sf) __Y, xform1));
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_hadd_pd (__m128d __X, __m128d __Y)
+{
+  return (__m128d) vec_add (vec_mergeh ((__v2df) __X, (__v2df) __Y),
+			    vec_mergel ((__v2df) __X, (__v2df) __Y));
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_hsub_pd (__m128d __X, __m128d __Y)
+{
+  return (__m128d) vec_sub (vec_mergeh ((__v2df) __X, (__v2df) __Y),
+			    vec_mergel ((__v2df) __X, (__v2df) __Y));
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_movehdup_ps (__m128 __X)
+{
+  return (__m128) vec_mergeo ((__v4su) __X, (__v4su) __X);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_moveldup_ps (__m128 __X)
+{
+  return (__m128) vec_mergee ((__v4su) __X, (__v4su) __X);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_loaddup_pd (double const *__P)
+{
+  return (__m128d) vec_splats (*__P);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_movedup_pd (__m128d __X)
+{
+  return _mm_shuffle_pd (__X, __X, _MM_SHUFFLE2 (0, 0));
+}
+
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_lddqu_si128 (__m128i const *__P)
+{
+  return (__m128i) (vec_vsx_ld (0, (signed int const *) __P));
+}
+
+/* POWER8 / POWER9 have no equivalent for _mm_monitor nor _mm_mwait.  */
+
+#endif /* _PMMINTRIN_H_INCLUDED */
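
A minimal usage sketch, assuming a powerpc64le toolchain with these
compatibility headers installed; the file name and compile line are
illustrative (e.g. "gcc -mcpu=power8 -DNO_WARN_X86_INTRINSICS sse3-test.c"):

/* sse3-test.c: exercise one of the new intrinsics via the compatibility
   header.  _mm_set_pd and _mm_storeu_pd come from <emmintrin.h>, which
   pmmintrin.h includes.  */
#include <pmmintrin.h>
#include <stdio.h>

int main (void)
{
  __m128d x = _mm_set_pd (4.0, 3.0);  /* x = {3.0, 4.0} */
  __m128d y = _mm_set_pd (2.0, 1.0);  /* y = {1.0, 2.0} */
  __m128d r = _mm_hadd_pd (x, y);     /* r = {3.0 + 4.0, 1.0 + 2.0} */
  double out[2];
  _mm_storeu_pd (out, r);
  printf ("%g %g\n", out[0], out[1]); /* prints: 7 3 */
  return 0;
}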