From 4901c1b4f34955b9b6c8cba15eab6fce033c02a6 Mon Sep 17 00:00:00 2001 From: hubicka Date: Mon, 21 Oct 2002 22:09:06 +0000 Subject: [PATCH] * i386.c (builtin_description): Add punpcklqdq and movdq2q. (ix86_init_mmx_sse_builtins): Add v2di_ftype_void, di_ftype_v2di, v16qi_ftype_pchar, void_ftype_pchar_v16qi, v4si_ftype_pchar, void_ftype_pchar_v4si; Initialize __builtin_ia32_movdq2q, __builtin_ia32_loaddqa, __builtin_ia32_loaddqu, __builtin_ia32_loadd, __builtin_ia32_storedqa, __builtin_ia32_storedqu, __builtin_ia32_stored, __builtin_ia32_setzero128. (ix86_expand_builtin): Handle IX86_BUILTIN_CLRTI, IX86_BUILTIN_LOADDQA, IX86_BUILTIN_LOADDQU, IX86_BUILTIN_LOADD, IX86_BUILTIN_STOREDQA, IX86_BUILTIN_STOREDQU, IX86_BUILTIN_STORED, IX86_BUILTIN_MOVQ. * i386.h (ix86_builtins): Add IX86_BUILTIN_LOADDQA, IX86_BUILTIN_LOADDQU, IX86_BUILTIN_STOREDQA, IX86_BUILTIN_STOREDQU, IX86_BUILTIN_LOADD, IX86_BUILTIN_STORED, IX86_BUILTIN_CLRTI, IX86_BUILTIN_MOVDQ2Q, IX86_BUILTIN_PUNPCKLQDQ128, IX86_BUILTIN_MOVQ. * i386.md (sse2_punpcklqdq, sse2_loadd, sse2_stored, sse2_movq): New patterns. (sse2_movdqa, sse2_movdqu, sse2_movdq2q): Fix. * xmmintrin.h (_mm_load_si128, _mm_loadu_si128, _mm_loadl_epi64, _mm_store_si128, _mm_storeu_si128, _mm_storel_epi64, _mm_setzero_si128, _mm_set_epi64, _mm_set_epi32, _mm_set_epi16, _mm_set_epi8, _mm_set1_epi64, _mm_set1_epi32, _mm_set1_epi16, _mm_set1_epi8, _mm_setr_epi64, _mm_setr_epi32, _mm_setr_epi16, _mm_setr_epi8, _mm_unpacklo_epi64, _mm_move_epi64): New functions. (_mm_insert_epi16): Fix. 
git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/trunk@58391 138bc75d-0d04-0410-961f-82ee72b054a4 --- gcc/ChangeLog | 27 +++++ gcc/config/i386/i386.c | 56 +++++++++- gcc/config/i386/i386.h | 12 +++ gcc/config/i386/i386.md | 80 +++++++++++--- gcc/config/i386/xmmintrin.h | 248 +++++++++++++++++++++++++++++++++++++++++++- 5 files changed, 407 insertions(+), 16 deletions(-) diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 6146ec0..1660ae4 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,30 @@ +Tue Oct 22 00:04:20 CEST 2002 Jan Hubicka + + * i386.c (builtin_description): Add punpcklqdq and movdq2q. + (ix86_init_mmx_sse_builtins): Add v2di_ftype_void, di_ftype_v2di, + v16qi_ftype_pchar, void_ftype_pchar_v16qi, v4si_ftype_pchar, + void_ftype_pchar_v4si; Initialize __builtin_ia32_movdq2q, + __builtin_ia32_loaddqa, __builtin_ia32_loaddqu, __builtin_ia32_loadd, + __builtin_ia32_storedqa, __builtin_ia32_storedqu, __builtin_ia32_stored, + __builtin_ia32_setzero128. + (ix86_expand_builtin): Handle IX86_BUILTIN_CLRTI, IX86_BUILTIN_LOADDQA, + IX86_BUILTIN_LOADDQU, IX86_BUILTIN_LOADD, IX86_BUILTIN_STOREDQA, + IX86_BUILTIN_STOREDQU, IX86_BUILTIN_STORED, IX86_BUILTIN_MOVQ. + * i386.h (ix86_builtins): Add IX86_BUILTIN_LOADDQA, IX86_BUILTIN_LOADDQU, + IX86_BUILTIN_STOREDQA, IX86_BUILTIN_STOREDQU, IX86_BUILTIN_LOADD, + IX86_BUILTIN_STORED, IX86_BUILTIN_CLRTI, IX86_BUILTIN_MOVDQ2Q, + IX86_BUILTIN_PUNPCKLQDQ128, IX86_BUILTIN_MOVQ. + * i386.md (sse2_punpcklqdq, sse2_loadd, sse2_stored, + sse2_movq): New patterns. + (sse2_movdqa, sse2_movdqu, sse2_movdq2q): Fix. + * xmmintrin.h (_mm_load_si128, _mm_loadu_si128, _mm_loadl_epi64, + _mm_store_si128, _mm_storeu_si128, _mm_storel_epi64, + _mm_setzero_si128, _mm_set_epi64, _mm_set_epi32, _mm_set_epi16, + _mm_set_epi8, _mm_set1_epi64, _mm_set1_epi32, _mm_set1_epi16, + _mm_set1_epi8, _mm_setr_epi64, _mm_setr_epi32, _mm_setr_epi16, + _mm_setr_epi8, _mm_unpacklo_epi64, _mm_move_epi64): New functions. + (_mm_insert_epi16): Fix. 
+ 2002-10-21 Dale Johannesen * config/rs6000/rs6000.c (rs6000_reverse_condition): Handle diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index fab6177..2b857bf 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -797,6 +797,7 @@ const struct attribute_spec ix86_attribute_table[]; static tree ix86_handle_cdecl_attribute PARAMS ((tree *, tree, tree, int, bool *)); static tree ix86_handle_regparm_attribute PARAMS ((tree *, tree, tree, int, bool *)); static int ix86_value_regno PARAMS ((enum machine_mode)); +static bool contains_128bit_aligned_vector_p PARAMS ((tree)); #if defined (DO_GLOBAL_CTORS_BODY) && defined (HAS_INIT_SECTION) static void ix86_svr3_asm_out_constructor PARAMS ((rtx, int)); @@ -12117,6 +12118,7 @@ static const struct builtin_description bdesc_2arg[] = { MASK_SSE2, CODE_FOR_sse2_punpcklbw, "__builtin_ia32_punpcklbw128", IX86_BUILTIN_PUNPCKLBW128, 0, 0 }, { MASK_SSE2, CODE_FOR_sse2_punpcklwd, "__builtin_ia32_punpcklwd128", IX86_BUILTIN_PUNPCKLWD128, 0, 0 }, { MASK_SSE2, CODE_FOR_sse2_punpckldq, "__builtin_ia32_punpckldq128", IX86_BUILTIN_PUNPCKLDQ128, 0, 0 }, + { MASK_SSE2, CODE_FOR_sse2_punpcklqdq, "__builtin_ia32_punpcklqdq128", IX86_BUILTIN_PUNPCKLQDQ128, 0, 0 }, { MASK_SSE2, CODE_FOR_sse2_packsswb, "__builtin_ia32_packsswb128", IX86_BUILTIN_PACKSSWB128, 0, 0 }, { MASK_SSE2, CODE_FOR_sse2_packssdw, "__builtin_ia32_packssdw128", IX86_BUILTIN_PACKSSDW128, 0, 0 }, @@ -12168,6 +12170,7 @@ static const struct builtin_description bdesc_1arg[] = { MASK_SSE2, CODE_FOR_sse2_pmovmskb, 0, IX86_BUILTIN_PMOVMSKB128, 0, 0 }, { MASK_SSE2, CODE_FOR_sse2_movmskpd, 0, IX86_BUILTIN_MOVMSKPD, 0, 0 }, { MASK_SSE2, CODE_FOR_sse2_movq2dq, 0, IX86_BUILTIN_MOVQ2DQ, 0, 0 }, + { MASK_SSE2, CODE_FOR_sse2_movdq2q, 0, IX86_BUILTIN_MOVDQ2Q, 0, 0 }, { MASK_SSE2, CODE_FOR_sqrtv2df2, 0, IX86_BUILTIN_SQRTPD, 0, 0 }, @@ -12187,7 +12190,9 @@ static const struct builtin_description bdesc_1arg[] = { MASK_SSE2, CODE_FOR_cvtps2dq, 0, IX86_BUILTIN_CVTPS2DQ, 0, 
0 }, { MASK_SSE2, CODE_FOR_cvtps2pd, 0, IX86_BUILTIN_CVTPS2PD, 0, 0 }, - { MASK_SSE2, CODE_FOR_cvttps2dq, 0, IX86_BUILTIN_CVTTPS2DQ, 0, 0 } + { MASK_SSE2, CODE_FOR_cvttps2dq, 0, IX86_BUILTIN_CVTTPS2DQ, 0, 0 }, + + { MASK_SSE2, CODE_FOR_sse2_movq, 0, IX86_BUILTIN_MOVQ, 0, 0 } }; void @@ -12343,6 +12348,8 @@ ix86_init_mmx_sse_builtins () tree ti_ftype_void = build_function_type (intTI_type_node, void_list_node); + tree v2di_ftype_void + = build_function_type (V2DI_type_node, void_list_node); tree ti_ftype_ti_ti = build_function_type_list (intTI_type_node, intTI_type_node, intTI_type_node, NULL_TREE); @@ -12351,6 +12358,9 @@ ix86_init_mmx_sse_builtins () tree v2di_ftype_di = build_function_type_list (V2DI_type_node, long_long_unsigned_type_node, NULL_TREE); + tree di_ftype_v2di + = build_function_type_list (long_long_unsigned_type_node, + V2DI_type_node, NULL_TREE); tree v4sf_ftype_v4si = build_function_type_list (V4SF_type_node, V4SI_type_node, NULL_TREE); tree v4si_ftype_v4sf @@ -12459,6 +12469,18 @@ ix86_init_mmx_sse_builtins () V16QI_type_node, V16QI_type_node, NULL_TREE); tree int_ftype_v16qi = build_function_type_list (integer_type_node, V16QI_type_node, NULL_TREE); + tree v16qi_ftype_pchar + = build_function_type_list (V16QI_type_node, pchar_type_node, NULL_TREE); + tree void_ftype_pchar_v16qi + = build_function_type_list (void_type_node, + pchar_type_node, V16QI_type_node, NULL_TREE); + tree v4si_ftype_pchar + = build_function_type_list (V4SI_type_node, pchar_type_node, NULL_TREE); + tree void_ftype_pchar_v4si + = build_function_type_list (void_type_node, + pchar_type_node, V4SI_type_node, NULL_TREE); + tree v2di_ftype_v2di + = build_function_type_list (V2DI_type_node, V2DI_type_node, NULL_TREE); /* Add all builtins that are more or less simple operations on two operands. 
*/ @@ -12639,6 +12661,7 @@ ix86_init_mmx_sse_builtins () def_builtin (MASK_SSE2, "__builtin_ia32_maskmovdqu", void_ftype_v16qi_v16qi_pchar, IX86_BUILTIN_MASKMOVDQU); def_builtin (MASK_SSE2, "__builtin_ia32_movq2dq", v2di_ftype_di, IX86_BUILTIN_MOVQ2DQ); + def_builtin (MASK_SSE2, "__builtin_ia32_movdq2q", di_ftype_v2di, IX86_BUILTIN_MOVDQ2Q); def_builtin (MASK_SSE2, "__builtin_ia32_loadapd", v2df_ftype_pdouble, IX86_BUILTIN_LOADAPD); def_builtin (MASK_SSE2, "__builtin_ia32_loadupd", v2df_ftype_pdouble, IX86_BUILTIN_LOADUPD); @@ -12702,6 +12725,16 @@ ix86_init_mmx_sse_builtins () def_builtin (MASK_SSE2, "__builtin_ia32_lfence", void_ftype_void, IX86_BUILTIN_LFENCE); def_builtin (MASK_SSE2, "__builtin_ia32_mfence", void_ftype_void, IX86_BUILTIN_MFENCE); + def_builtin (MASK_SSE2, "__builtin_ia32_loaddqa", v16qi_ftype_pchar, IX86_BUILTIN_LOADDQA); + def_builtin (MASK_SSE2, "__builtin_ia32_loaddqu", v16qi_ftype_pchar, IX86_BUILTIN_LOADDQU); + def_builtin (MASK_SSE2, "__builtin_ia32_loadd", v4si_ftype_pchar, IX86_BUILTIN_LOADD); + def_builtin (MASK_SSE2, "__builtin_ia32_storedqa", void_ftype_pchar_v16qi, IX86_BUILTIN_STOREDQA); + def_builtin (MASK_SSE2, "__builtin_ia32_storedqu", void_ftype_pchar_v16qi, IX86_BUILTIN_STOREDQU); + def_builtin (MASK_SSE2, "__builtin_ia32_stored", void_ftype_pchar_v4si, IX86_BUILTIN_STORED); + def_builtin (MASK_SSE2, "__builtin_ia32_movq", v2di_ftype_v2di, IX86_BUILTIN_MOVQ); + + def_builtin (MASK_SSE1, "__builtin_ia32_setzero128", v2di_ftype_void, IX86_BUILTIN_CLRTI); + def_builtin (MASK_SSE2, "__builtin_ia32_psllw128", v8hi_ftype_v8hi_v2di, IX86_BUILTIN_PSLLW128); def_builtin (MASK_SSE2, "__builtin_ia32_pslld128", v4si_ftype_v4si_v2di, IX86_BUILTIN_PSLLD128); def_builtin (MASK_SSE2, "__builtin_ia32_psllq128", v2di_ftype_v2di_v2di, IX86_BUILTIN_PSLLQ128); @@ -13153,6 +13186,7 @@ ix86_expand_builtin (exp, target, subtarget, mode, ignore) case IX86_BUILTIN_STOREAPS: return ix86_expand_store_builtin (CODE_FOR_sse_movaps, arglist); + case 
IX86_BUILTIN_STOREUPS: return ix86_expand_store_builtin (CODE_FOR_sse_movups, arglist); @@ -13421,6 +13455,12 @@ ix86_expand_builtin (exp, target, subtarget, mode, ignore) emit_insn (gen_mmx_clrdi (target)); return target; + case IX86_BUILTIN_CLRTI: + target = gen_reg_rtx (V2DImode); + emit_insn (gen_sse2_clrti (simplify_gen_subreg (TImode, target, V2DImode, 0))); + return target; + + case IX86_BUILTIN_SQRTSD: return ix86_expand_unop1_builtin (CODE_FOR_vmsqrtv2df2, arglist, target); case IX86_BUILTIN_LOADAPD: @@ -13507,6 +13547,20 @@ ix86_expand_builtin (exp, target, subtarget, mode, ignore) case IX86_BUILTIN_MOVNTI: return ix86_expand_store_builtin (CODE_FOR_sse2_movntsi, arglist); + case IX86_BUILTIN_LOADDQA: + return ix86_expand_unop_builtin (CODE_FOR_sse2_movdqa, arglist, target, 1); + case IX86_BUILTIN_LOADDQU: + return ix86_expand_unop_builtin (CODE_FOR_sse2_movdqu, arglist, target, 1); + case IX86_BUILTIN_LOADD: + return ix86_expand_unop_builtin (CODE_FOR_sse2_loadd, arglist, target, 1); + + case IX86_BUILTIN_STOREDQA: + return ix86_expand_store_builtin (CODE_FOR_sse2_movdqa, arglist); + case IX86_BUILTIN_STOREDQU: + return ix86_expand_store_builtin (CODE_FOR_sse2_movdqu, arglist); + case IX86_BUILTIN_STORED: + return ix86_expand_store_builtin (CODE_FOR_sse2_stored, arglist); + default: break; } diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h index 8ad3cd4..f832e10 100644 --- a/gcc/config/i386/i386.h +++ b/gcc/config/i386/i386.h @@ -2090,6 +2090,16 @@ enum ix86_builtins IX86_BUILTIN_MOVNTPS, IX86_BUILTIN_MOVNTQ, + IX86_BUILTIN_LOADDQA, + IX86_BUILTIN_LOADDQU, + IX86_BUILTIN_STOREDQA, + IX86_BUILTIN_STOREDQU, + IX86_BUILTIN_MOVQ, + IX86_BUILTIN_LOADD, + IX86_BUILTIN_STORED, + + IX86_BUILTIN_CLRTI, + IX86_BUILTIN_PACKSSWB, IX86_BUILTIN_PACKSSDW, IX86_BUILTIN_PACKUSWB, @@ -2336,6 +2346,7 @@ enum ix86_builtins IX86_BUILTIN_MOVMSKPD, IX86_BUILTIN_PMOVMSKB128, IX86_BUILTIN_MOVQ2DQ, + IX86_BUILTIN_MOVDQ2Q, IX86_BUILTIN_PACKSSWB128, 
IX86_BUILTIN_PACKSSDW128, @@ -2419,6 +2430,7 @@ enum ix86_builtins IX86_BUILTIN_PUNPCKLBW128, IX86_BUILTIN_PUNPCKLWD128, IX86_BUILTIN_PUNPCKLDQ128, + IX86_BUILTIN_PUNPCKLQDQ128, IX86_BUILTIN_CLFLUSH, IX86_BUILTIN_MFENCE, diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index bb679f7..a499131 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -21545,6 +21545,19 @@ [(set_attr "type" "ssecvt") (set_attr "mode" "TI")]) +(define_insn "sse2_punpcklqdq" + [(set (match_operand:V2DI 0 "register_operand" "=x") + (vec_merge:V2DI + (match_operand:V2DI 1 "register_operand" "0") + (vec_select:V2DI (match_operand:V2DI 2 "register_operand" "x") + (parallel [(const_int 1) + (const_int 0)])) + (const_int 1)))] + "TARGET_SSE2" + "punpcklqdq\t{%2, %0|%0, %2}" + [(set_attr "type" "ssecvt") + (set_attr "mode" "TI")]) + ;; SSE2 moves (define_insn "sse2_movapd" @@ -21570,9 +21583,9 @@ (set_attr "mode" "V2DF")]) (define_insn "sse2_movdqa" - [(set (match_operand:TI 0 "nonimmediate_operand" "=x,m") - (unspec:TI [(match_operand:TI 1 "general_operand" "xm,x")] - UNSPEC_MOVA))] + [(set (match_operand:V16QI 0 "nonimmediate_operand" "=x,m") + (unspec:V16QI [(match_operand:V16QI 1 "general_operand" "xm,x")] + UNSPEC_MOVA))] "TARGET_SSE2" "@ movdqa\t{%1, %0|%0, %1} @@ -21581,9 +21594,9 @@ (set_attr "mode" "TI")]) (define_insn "sse2_movdqu" - [(set (match_operand:TI 0 "nonimmediate_operand" "=x,m") - (unspec:TI [(match_operand:TI 1 "general_operand" "xm,x")] - UNSPEC_MOVU))] + [(set (match_operand:V16QI 0 "nonimmediate_operand" "=x,m") + (unspec:V16QI [(match_operand:V16QI 1 "general_operand" "xm,x")] + UNSPEC_MOVU))] "TARGET_SSE2" "@ movdqu\t{%1, %0|%0, %1} @@ -21592,21 +21605,60 @@ (set_attr "mode" "TI")]) (define_insn "sse2_movdq2q" - [(set (match_operand:DI 0 "nonimmediate_operand" "=y") - (vec_select:DI (match_operand:V2DI 1 "general_operand" "x") + [(set (match_operand:DI 0 "nonimmediate_operand" "=m,y") + (vec_select:DI (match_operand:V2DI 1 "register_operand" 
"x,x") (parallel [(const_int 0)])))] "TARGET_SSE2" - "movdq2q\t{%1, %0|%0, %1}" + "@ + movq\t{%1, %0|%0, %1} + movdq2q\t{%1, %0|%0, %1}" [(set_attr "type" "ssecvt") (set_attr "mode" "TI")]) (define_insn "sse2_movq2dq" - [(set (match_operand:V2DI 0 "nonimmediate_operand" "=x") - (vec_concat:V2DI (match_operand:DI 1 "general_operand" "y") - (const_vector:DI [(const_int 0)])))] + [(set (match_operand:V2DI 0 "register_operand" "=x,?x") + (vec_concat:V2DI (match_operand:DI 1 "nonimmediate_operand" "m,y") + (const_int 0)))] "TARGET_SSE2" - "movq2dq\t{%1, %0|%0, %1}" - [(set_attr "type" "ssecvt") + "@ + movq\t{%1, %0|%0, %1} + movq2dq\t{%1, %0|%0, %1}" + [(set_attr "type" "ssecvt,ssemov") + (set_attr "mode" "TI")]) + +(define_insn "sse2_movq" + [(set (match_operand:V2DI 0 "register_operand" "=x") + (vec_concat:V2DI (vec_select:DI + (match_operand:V2DI 1 "nonimmediate_operand" "xm") + (parallel [(const_int 0)])) + (const_int 0)))] + "TARGET_SSE2" + "movq\t{%1, %0|%0, %1}" + [(set_attr "type" "ssemov") + (set_attr "mode" "TI")]) + +(define_insn "sse2_loadd" + [(set (match_operand:V4SI 0 "register_operand" "=x") + (vec_merge:V4SI + (vec_duplicate:V4SI (match_operand:SI 1 "nonimmediate_operand" "mr")) + (const_vector:V4SI [(const_int 0) + (const_int 0) + (const_int 0) + (const_int 0)]) + (const_int 1)))] + "TARGET_SSE2" + "movd\t{%1, %0|%0, %1}" + [(set_attr "type" "ssemov") + (set_attr "mode" "TI")]) + +(define_insn "sse2_stored" + [(set (match_operand:SI 0 "nonimmediate_operand" "=mr") + (vec_select:SI + (match_operand:V4SI 1 "register_operand" "x") + (parallel [(const_int 0)])))] + "TARGET_SSE2" + "movd\t{%1, %0|%0, %1}" + [(set_attr "type" "ssemov") + (set_attr "mode" "TI")]) (define_insn "sse2_movhpd" diff --git a/gcc/config/i386/xmmintrin.h b/gcc/config/i386/xmmintrin.h index 8c5d41b..08ef76a 100644 --- a/gcc/config/i386/xmmintrin.h +++ b/gcc/config/i386/xmmintrin.h @@ -1581,6 +1581,246 @@ _mm_ucomineq_sd (__m128d __A, __m128d __B) return __builtin_ia32_ucomisdneq 
((__v2df)__A, (__v2df)__B); } +/* Create a vector with element 0 as *P and the rest zero. */ + +static __inline __m128i +_mm_load_si128 (__m128i const *__P) +{ + return (__m128i) __builtin_ia32_loaddqa (__P); +} + +static __inline __m128i +_mm_loadu_si128 (__m128i const *__P) +{ + return (__m128i) __builtin_ia32_loaddqu (__P); +} + +static __inline __m128i +_mm_loadl_epi64 (__m128i const *__P) +{ + return (__m128i) __builtin_ia32_movq2dq (*(unsigned long long *)__P); +} + +static __inline void +_mm_store_si128 (__m128i *__P, __m128i __B) +{ + __builtin_ia32_storedqa (__P, (__v16qi)__B); +} + +static __inline void +_mm_storeu_si128 (__m128i *__P, __m128i __B) +{ + __builtin_ia32_storedqu (__P, (__v16qi)__B); +} + +static __inline void +_mm_storel_epi64 (__m128i *__P, __m128i __B) +{ + *(long long *)__P = __builtin_ia32_movdq2q ((__v2di)__B); +} + +static __inline __m128i +_mm_move_epi64 (__m128i __A) +{ + return (__m128i) __builtin_ia32_movq ((__v2di)__A); +} + +/* Create a vector of zeros. */ +static __inline __m128i +_mm_setzero_si128 (void) +{ + return (__m128i) __builtin_ia32_setzero128 (); +} + +static __inline __m128i +_mm_set_epi64 (__m64 __A, __m64 __B) +{ + __v2di __tmp = (__v2di)__builtin_ia32_movq2dq ((unsigned long long)__A); + __v2di __tmp2 = (__v2di)__builtin_ia32_movq2dq ((unsigned long long)__B); + return (__m128i)__builtin_ia32_punpcklqdq128 (__tmp2, __tmp); +} + +/* Create the vector [Z Y X W]. */ +static __inline __m128i +_mm_set_epi32 (int __Z, int __Y, int __X, int __W) +{ + union { + int __a[4]; + __m128i __v; + } __u; + + __u.__a[0] = __W; + __u.__a[1] = __X; + __u.__a[2] = __Y; + __u.__a[3] = __Z; + + return __u.__v; +} +/* Create the vector [S T U V Z Y X W]. 
*/ +static __inline __m128i +_mm_set_epi16 (short __Z, short __Y, short __X, short __W, + short __V, short __U, short __T, short __S) +{ + union { + short __a[8]; + __m128i __v; + } __u; + + __u.__a[0] = __S; + __u.__a[1] = __T; + __u.__a[2] = __U; + __u.__a[3] = __V; + __u.__a[4] = __W; + __u.__a[5] = __X; + __u.__a[6] = __Y; + __u.__a[7] = __Z; + + return __u.__v; +} + +/* Create the vector [S T U V Z Y X W]. */ +static __inline __m128i +_mm_set_epi8 (char __Z, char __Y, char __X, char __W, + char __V, char __U, char __T, char __S, + char __Z1, char __Y1, char __X1, char __W1, + char __V1, char __U1, char __T1, char __S1) +{ + union { + char __a[16]; + __m128i __v; + } __u; + + __u.__a[0] = __S1; + __u.__a[1] = __T1; + __u.__a[2] = __U1; + __u.__a[3] = __V1; + __u.__a[4] = __W1; + __u.__a[5] = __X1; + __u.__a[6] = __Y1; + __u.__a[7] = __Z1; + __u.__a[8] = __S; + __u.__a[9] = __T; + __u.__a[10] = __U; + __u.__a[11] = __V; + __u.__a[12] = __W; + __u.__a[13] = __X; + __u.__a[14] = __Y; + __u.__a[15] = __Z; + + return __u.__v; +} + +static __inline __m128i +_mm_set1_epi64 (__m64 __A) +{ + __v2di __tmp = (__v2di)__builtin_ia32_movq2dq ((unsigned long long)__A); + return (__m128i)__builtin_ia32_punpcklqdq128 (__tmp, __tmp); +} + +static __inline __m128i +_mm_set1_epi32 (int __A) +{ + __v4si __tmp = (__v4si)__builtin_ia32_loadd (&__A); + return (__m128i) __builtin_ia32_pshufd ((__v4si)__tmp, _MM_SHUFFLE (0,0,0,0)); +} + +static __inline __m128i +_mm_set1_epi16 (short __A) +{ + int __Acopy = (unsigned short)__A; + __v4si __tmp = (__v4si)__builtin_ia32_loadd (&__Acopy); + __tmp = (__v4si)__builtin_ia32_punpcklwd128 ((__v8hi)__tmp, (__v8hi)__tmp); + return (__m128i) __builtin_ia32_pshufd ((__v4si)__tmp, _MM_SHUFFLE (0,0,0,0)); +} + +static __inline __m128i +_mm_set1_epi8 (char __A) +{ + int __Acopy = (unsigned char)__A; + __v4si __tmp = (__v4si)__builtin_ia32_loadd (&__Acopy); + __tmp = (__v4si)__builtin_ia32_punpcklbw128 ((__v16qi)__tmp, (__v16qi)__tmp); + __tmp = 
(__v4si)__builtin_ia32_punpcklbw128 ((__v16qi)__tmp, (__v16qi)__tmp); + return (__m128i) __builtin_ia32_pshufd ((__v4si)__tmp, _MM_SHUFFLE (0,0,0,0)); +} + +static __inline __m128i +_mm_setr_epi64 (__m64 __A, __m64 __B) +{ + __v2di __tmp = (__v2di)__builtin_ia32_movq2dq ((unsigned long long)__A); + __v2di __tmp2 = (__v2di)__builtin_ia32_movq2dq ((unsigned long long)__B); + return (__m128i)__builtin_ia32_punpcklqdq128 (__tmp, __tmp2); +} + +/* Create the vector [Z Y X W]. */ +static __inline __m128i +_mm_setr_epi32 (int __W, int __X, int __Y, int __Z) +{ + union { + int __a[4]; + __m128i __v; + } __u; + + __u.__a[0] = __W; + __u.__a[1] = __X; + __u.__a[2] = __Y; + __u.__a[3] = __Z; + + return __u.__v; +} +/* Create the vector [S T U V Z Y X W]. */ +static __inline __m128i +_mm_setr_epi16 (short __S, short __T, short __U, short __V, + short __W, short __X, short __Y, short __Z) +{ + union { + short __a[8]; + __m128i __v; + } __u; + + __u.__a[0] = __S; + __u.__a[1] = __T; + __u.__a[2] = __U; + __u.__a[3] = __V; + __u.__a[4] = __W; + __u.__a[5] = __X; + __u.__a[6] = __Y; + __u.__a[7] = __Z; + + return __u.__v; +} + +/* Create the vector [S T U V Z Y X W]. 
*/ +static __inline __m128i +_mm_setr_epi8 (char __S1, char __T1, char __U1, char __V1, + char __W1, char __X1, char __Y1, char __Z1, + char __S, char __T, char __U, char __V, + char __W, char __X, char __Y, char __Z) +{ + union { + char __a[16]; + __m128i __v; + } __u; + + __u.__a[0] = __S1; + __u.__a[1] = __T1; + __u.__a[2] = __U1; + __u.__a[3] = __V1; + __u.__a[4] = __W1; + __u.__a[5] = __X1; + __u.__a[6] = __Y1; + __u.__a[7] = __Z1; + __u.__a[8] = __S; + __u.__a[9] = __T; + __u.__a[10] = __U; + __u.__a[11] = __V; + __u.__a[12] = __W; + __u.__a[13] = __X; + __u.__a[14] = __Y; + __u.__a[15] = __Z; + + return __u.__v; +} + static __inline __m128d _mm_cvtepi32_pd (__m128i __A) { @@ -1776,6 +2016,12 @@ _mm_unpacklo_epi32 (__m128i __A, __m128i __B) } static __inline __m128i +_mm_unpacklo_epi64 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_punpcklqdq128 ((__v2di)__A, (__v2di)__B); +} + +static __inline __m128i _mm_add_epi8 (__m128i __A, __m128i __B) { return (__m128i)__builtin_ia32_paddb128 ((__v16qi)__A, (__v16qi)__B); @@ -2075,7 +2321,7 @@ _mm_cmpgt_epi32 (__m128i __A, __m128i __B) #define _mm_extract_epi16(__A, __B) __builtin_ia32_pextrw128 ((__v8hi)__A, __B) -#define _mm_insert_epi16 (__A, __B, __C) ((__m128i)__builtin_ia32_pinsrw128 ((__v8hi)__A, __B, __C)) +#define _mm_insert_epi16(__A, __B, __C) ((__m128i)__builtin_ia32_pinsrw128 ((__v8hi)__A, __B, __C)) static __inline __m128i _mm_max_epi16 (__m128i __A, __m128i __B) -- 2.7.4