From 95879c728b9a59ae67db022ad370eb66374090f3 Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Thu, 28 Aug 2008 12:18:44 -0700 Subject: [PATCH] [multiple changes] 2008-08-28 H.J. Lu Joey Ye Xuepeng Guo * config.gcc (extra_headers): Add gmmintrin.h for x86 and x86-64. * config/i386/cpuid.h (bit_FMA): New. (bit_XSAVE): Likewise. (bit_OSXSAVE): Likewise. (bit_AVX): Likewise. * config/i386/gas.h (ASM_OUTPUT_OPCODE): Undefine before define. Use ASM_OUTPUT_AVX_PREFIX. * config/i386/gmmintrin.h: New. * config/i386/i386.c (x86_64_reg_class): Add X86_64_AVX_CLASS. (OPTION_MASK_ISA_AVX_SET): New. (OPTION_MASK_ISA_FMA_SET): Likewise. (OPTION_MASK_ISA_AVX_UNSET): Likewise. (OPTION_MASK_ISA_FMA_UNSET): Likewise. (OPTION_MASK_ISA_SSE4_2_UNSET): Updated. (ix86_handle_option): Handle OPT_mavx and OPT_mfma. (pta_flags): Add PTA_AVX and PTA_FMA. (override_options): Handle PTA_AVX and PTA_FMA. (init_cumulative_args): Handle warn_avx. (classify_argument): Return 0 for COImode and OImode. Return 1 and X86_64_AVX_CLASS for 256bit vector types. (examine_argument): Handle X86_64_AVX_CLASS. (construct_container): Likewise. (function_arg_advance_32): Pass OImode and 256bit vector types in AVX register. (function_arg_advance_64): Take a new argument to indicate if a parameter is named. Handle 256bit vector types. Return immediately for unnamed 256bit vector mode parameters. (function_arg_advance): Updated. (function_arg_32): Add comments for TImode. Handle OImode and 256bit vector types. (function_arg_64): Take a new argument to indicate if a parameter is named. Handle 256bit vector types. Return NULL for unnamed 256bit vector mode parameters. (function_arg): Updated. (setup_incoming_varargs_64): Support AVX encoding for *sse_prologue_save_insn. (ix86_gimplify_va_arg): Handle 256bit vector mode parameters. (standard_sse_constant_p): Return -2 for all 1s if SSE2 isn't enabled. For all 1s in 256bit vector modes, return 3 if AVX is enabled, otherwise return -3. (standard_sse_constant_opcode): Handle AVX and 256bit vector modes. (print_reg): Support AVX registers. Handle 'x' and 't'. Handle 'd' to duplicate the operand. (print_operand): Likewise. Also support AVX vector compare instructions. (output_387_binary_op): Support AVX. (output_fp_compare): Likewise. (ix86_expand_vector_move_misalign): Likewise. (ix86_attr_length_vex_default): New. 
(ix86_builtins): Add IX86_BUILTIN_ADDPD256, IX86_BUILTIN_ADDPS256, IX86_BUILTIN_ADDSUBPD256, IX86_BUILTIN_ADDSUBPS256, IX86_BUILTIN_ANDPD256, IX86_BUILTIN_ANDPS256, IX86_BUILTIN_ANDNPD256, IX86_BUILTIN_ANDNPS256, IX86_BUILTIN_BLENDPD256, IX86_BUILTIN_BLENDPS256, IX86_BUILTIN_BLENDVPD256, IX86_BUILTIN_BLENDVPS256, IX86_BUILTIN_DIVPD256, IX86_BUILTIN_DIVPS256, IX86_BUILTIN_DPPS256, IX86_BUILTIN_HADDPD256, IX86_BUILTIN_HADDPS256, IX86_BUILTIN_HSUBPD256, IX86_BUILTIN_HSUBPS256, IX86_BUILTIN_MAXPD256, IX86_BUILTIN_MAXPS256, IX86_BUILTIN_MINPD256, IX86_BUILTIN_MINPS256, IX86_BUILTIN_MULPD256, IX86_BUILTIN_MULPS256, IX86_BUILTIN_ORPD256, IX86_BUILTIN_ORPS256, IX86_BUILTIN_SHUFPD256, IX86_BUILTIN_SHUFPS256, IX86_BUILTIN_SUBPD256, IX86_BUILTIN_SUBPS256, IX86_BUILTIN_XORPD256, IX86_BUILTIN_XORPS256, IX86_BUILTIN_CMPSD, IX86_BUILTIN_CMPSS, IX86_BUILTIN_CMPPD, IX86_BUILTIN_CMPPS, IX86_BUILTIN_CMPPD256, IX86_BUILTIN_CMPPS256, IX86_BUILTIN_CVTDQ2PD256, IX86_BUILTIN_CVTDQ2PS256, IX86_BUILTIN_CVTPD2PS256, IX86_BUILTIN_CVTPS2DQ256, IX86_BUILTIN_CVTPS2PD256, IX86_BUILTIN_CVTTPD2DQ256, IX86_BUILTIN_CVTPD2DQ256, IX86_BUILTIN_CVTTPS2DQ256, IX86_BUILTIN_EXTRACTF128PD256, IX86_BUILTIN_EXTRACTF128PS256, IX86_BUILTIN_EXTRACTF128SI256, IX86_BUILTIN_VZEROALL, IX86_BUILTIN_VZEROUPPER, IX86_BUILTIN_VZEROUPPER_REX64, IX86_BUILTIN_VPERMILVARPD, IX86_BUILTIN_VPERMILVARPS, IX86_BUILTIN_VPERMILVARPD256, IX86_BUILTIN_VPERMILVARPS256, IX86_BUILTIN_VPERMILPD, IX86_BUILTIN_VPERMILPS, IX86_BUILTIN_VPERMILPD256, IX86_BUILTIN_VPERMILPS256, IX86_BUILTIN_VPERMIL2PD, IX86_BUILTIN_VPERMIL2PS, IX86_BUILTIN_VPERMIL2PD256, IX86_BUILTIN_VPERMIL2PS256, IX86_BUILTIN_VPERM2F128PD256, IX86_BUILTIN_VPERM2F128PS256, IX86_BUILTIN_VPERM2F128SI256, IX86_BUILTIN_VBROADCASTSS, IX86_BUILTIN_VBROADCASTSD256, IX86_BUILTIN_VBROADCASTSS256, IX86_BUILTIN_VBROADCASTPD256, IX86_BUILTIN_VBROADCASTPS256, IX86_BUILTIN_VINSERTF128PD256, IX86_BUILTIN_VINSERTF128PS256, IX86_BUILTIN_VINSERTF128SI256, IX86_BUILTIN_LOADUPD256, IX86_BUILTIN_LOADUPS256, IX86_BUILTIN_STOREUPD256, IX86_BUILTIN_STOREUPS256, IX86_BUILTIN_LDDQU256, IX86_BUILTIN_LOADDQU256, IX86_BUILTIN_STOREDQU256, IX86_BUILTIN_MASKLOADPD, IX86_BUILTIN_MASKLOADPS, IX86_BUILTIN_MASKSTOREPD, IX86_BUILTIN_MASKSTOREPS, IX86_BUILTIN_MASKLOADPD256, IX86_BUILTIN_MASKLOADPS256, IX86_BUILTIN_MASKSTOREPD256, IX86_BUILTIN_MASKSTOREPS256, IX86_BUILTIN_MOVSHDUP256, IX86_BUILTIN_MOVSLDUP256, IX86_BUILTIN_MOVDDUP256, IX86_BUILTIN_SQRTPD256, IX86_BUILTIN_SQRTPS256, IX86_BUILTIN_SQRTPS_NR256, IX86_BUILTIN_RSQRTPS256, IX86_BUILTIN_RSQRTPS_NR256, IX86_BUILTIN_RCPPS256, IX86_BUILTIN_ROUNDPD256, IX86_BUILTIN_ROUNDPS256, IX86_BUILTIN_UNPCKHPD256, IX86_BUILTIN_UNPCKLPD256, IX86_BUILTIN_UNPCKHPS256, IX86_BUILTIN_UNPCKLPS256, IX86_BUILTIN_SI256_SI, IX86_BUILTIN_PS256_PS, IX86_BUILTIN_PD256_PD, IX86_BUILTIN_SI_SI256, IX86_BUILTIN_PS_PS256, IX86_BUILTIN_PD_PD256, IX86_BUILTIN_VTESTZPD, IX86_BUILTIN_VTESTCPD, IX86_BUILTIN_VTESTNZCPD, IX86_BUILTIN_VTESTZPS, IX86_BUILTIN_VTESTCPS, IX86_BUILTIN_VTESTNZCPS, IX86_BUILTIN_VTESTZPD256, IX86_BUILTIN_VTESTCPD256, IX86_BUILTIN_VTESTNZCPD256, IX86_BUILTIN_VTESTZPS256, IX86_BUILTIN_VTESTCPS256, IX86_BUILTIN_VTESTNZCPS256, IX86_BUILTIN_PTESTZ256, IX86_BUILTIN_PTESTC256, IX86_BUILTIN_PTESTNZC256, IX86_BUILTIN_MOVMSKPD256 and IX86_BUILTIN_MOVMSKPS256, (ix86_special_builtin_type): Add V32QI_FTYPE_PCCHAR, V8SF_FTYPE_PCV4SF, V8SF_FTYPE_PCFLOAT, V4DF_FTYPE_PCV2DF, V4DF_FTYPE_PCDOUBLE, V8SF_FTYPE_PCV8SF_V8SF, V4DF_FTYPE_PCV4DF_V4DF, V4SF_FTYPE_PCV4SF_V4SF, V2DF_FTYPE_PCV2DF_V2DF, 
VOID_FTYPE_PCHAR_V32QI, VOID_FTYPE_PFLOAT_V8SF, VOID_FTYPE_PDOUBLE_V4DF, VOID_FTYPE_PV8SF_V8SF_V8SF, VOID_FTYPE_PV4DF_V4DF_V4DF, VOID_FTYPE_PV4SF_V4SF_V4SF and VOID_FTYPE_PV2DF_V2DF_V2DF, (ix86_builtin_type): Add INT_FTYPE_V8SF_V8SF_PTEST, INT_FTYPE_V4DI_V4DI_PTEST, INT_FTYPE_V4DF_V4DF_PTEST, INT_FTYPE_V4SF_V4SF_PTEST, INT_FTYPE_V2DF_V2DF_PTEST, INT_FTYPE_V8SF, INT_FTYPE_V4DF, V8SI_FTYPE_V8SF, V8SI_FTYPE_V4SI, V8SF_FTYPE_V8SF, V8SF_FTYPE_V8SI, V8SF_FTYPE_V4SF, V4SI_FTYPE_V8SI, V4SI_FTYPE_V4DF, V4DF_FTYPE_V4DF, V4DF_FTYPE_V4SI, V4DF_FTYPE_V4SF, V4DF_FTYPE_V2DF, V4SF_FTYPE_V4DF, V4SF_FTYPE_V8SF, V2DF_FTYPE_V4DF, V8SF_FTYPE_V8SF_V8SF, V8SF_FTYPE_V8SF_V8SI, V4DF_FTYPE_V4DF_V4DF, V4DF_FTYPE_V4DF_V4DI, V4SF_FTYPE_V4SF_V4SI, V2DF_FTYPE_V2DF_V2DI, V8SF_FTYPE_V8SF_INT, V4SI_FTYPE_V8SI_INT, V4SF_FTYPE_V8SF_INT, V2DF_FTYPE_V4DF_INT, V4DF_FTYPE_V4DF_INT, V8SF_FTYPE_V8SF_V8SF_V8SF, V4DF_FTYPE_V4DF_V4DF_V4DF, V8SI_FTYPE_V8SI_V8SI_INT, V8SF_FTYPE_V8SF_V8SF_INT, V4DF_FTYPE_V4DF_V4DF_INT, V4DF_FTYPE_V4DF_V2DF_INT, V8SF_FTYPE_V8SF_V8SF_V8SI_INT, V4DF_FTYPE_V4DF_V4DF_V4DI_INT, V4SF_FTYPE_V4SF_V4SF_V4SI_INT and V2DF_FTYPE_V2DF_V2DF_V2DI_INT. (bdesc_special_args): Add IX86_BUILTIN_VZEROALL, IX86_BUILTIN_VZEROUPPER. IX86_BUILTIN_VZEROUPPER_REX64, IX86_BUILTIN_VBROADCASTSS, IX86_BUILTIN_VBROADCASTSD256, IX86_BUILTIN_VBROADCASTSS256, IX86_BUILTIN_VBROADCASTPD256, IX86_BUILTIN_VBROADCASTPS256, IX86_BUILTIN_LOADUPD256, IX86_BUILTIN_LOADUPS256, IX86_BUILTIN_STOREUPD256, IX86_BUILTIN_STOREUPS256, IX86_BUILTIN_LOADDQU256, IX86_BUILTIN_STOREDQU256, IX86_BUILTIN_LDDQU256, IX86_BUILTIN_MASKLOADPD, IX86_BUILTIN_MASKLOADPS, IX86_BUILTIN_MASKLOADPD256, IX86_BUILTIN_MASKLOADPS256, IX86_BUILTIN_MASKSTOREPD, IX86_BUILTIN_MASKSTOREPS, IX86_BUILTIN_MASKSTOREPD256 and IX86_BUILTIN_MASKSTOREPS256. 
(ix86_builtins): Add IX86_BUILTIN_ADDPD256, IX86_BUILTIN_ADDPS256, IX86_BUILTIN_ADDSUBPD256, IX86_BUILTIN_ADDSUBPS256, IX86_BUILTIN_ANDPD256, IX86_BUILTIN_ANDPS256, IX86_BUILTIN_ANDNPD256, IX86_BUILTIN_ANDNPS256, IX86_BUILTIN_DIVPD256, IX86_BUILTIN_DIVPS256, IX86_BUILTIN_HADDPD256, IX86_BUILTIN_HSUBPS256, IX86_BUILTIN_HSUBPD256, IX86_BUILTIN_HADDPS256, IX86_BUILTIN_MAXPD256, IX86_BUILTIN_MAXPS256, IX86_BUILTIN_MINPD256, IX86_BUILTIN_MINPS256, IX86_BUILTIN_MULPD256, IX86_BUILTIN_MULPS256, IX86_BUILTIN_ORPD256, IX86_BUILTIN_ORPS256, IX86_BUILTIN_SUBPD256, IX86_BUILTIN_SUBPS256, IX86_BUILTIN_XORPD256, IX86_BUILTIN_XORPS256, IX86_BUILTIN_VPERMILVARPD, IX86_BUILTIN_VPERMILVARPS, IX86_BUILTIN_VPERMILVARPD256, IX86_BUILTIN_VPERMILVARPS256, IX86_BUILTIN_BLENDPD256, IX86_BUILTIN_BLENDPS256, IX86_BUILTIN_BLENDVPD256, IX86_BUILTIN_BLENDVPS256, IX86_BUILTIN_DPPS256, IX86_BUILTIN_SHUFPD256, IX86_BUILTIN_SHUFPS256, IX86_BUILTIN_CMPSD, IX86_BUILTIN_CMPSS, IX86_BUILTIN_CMPPD, IX86_BUILTIN_CMPPS, IX86_BUILTIN_CMPPD256, IX86_BUILTIN_CMPPS256, IX86_BUILTIN_EXTRACTF128PD256, IX86_BUILTIN_EXTRACTF128PS256, IX86_BUILTIN_EXTRACTF128SI256, IX86_BUILTIN_CVTDQ2PD256, IX86_BUILTIN_CVTDQ2PS256, IX86_BUILTIN_CVTPD2PS256, IX86_BUILTIN_CVTPS2DQ256, IX86_BUILTIN_CVTPS2PD256, IX86_BUILTIN_CVTTPD2DQ256, IX86_BUILTIN_CVTPD2DQ256, IX86_BUILTIN_CVTTPS2DQ256, IX86_BUILTIN_VPERM2F128PD256, IX86_BUILTIN_VPERM2F128PS256, IX86_BUILTIN_VPERM2F128SI256, IX86_BUILTIN_VPERMILPD, IX86_BUILTIN_VPERMILPS, IX86_BUILTIN_VPERMILPD256, IX86_BUILTIN_VPERMILPS256, IX86_BUILTIN_VPERMIL2PD, IX86_BUILTIN_VPERMIL2PS, IX86_BUILTIN_VPERMIL2PD256, IX86_BUILTIN_VPERMIL2PS256, IX86_BUILTIN_VINSERTF128PD256, IX86_BUILTIN_VINSERTF128PS256, IX86_BUILTIN_VINSERTF128SI256, IX86_BUILTIN_MOVSHDUP256, IX86_BUILTIN_MOVSLDUP256, IX86_BUILTIN_MOVDDUP256, IX86_BUILTIN_SQRTPD256, IX86_BUILTIN_SQRTPS256, IX86_BUILTIN_SQRTPS_NR256, IX86_BUILTIN_RSQRTPS256, IX86_BUILTIN_RSQRTPS_NR256, IX86_BUILTIN_RCPPS256, IX86_BUILTIN_ROUNDPD256, IX86_BUILTIN_ROUNDPS256, IX86_BUILTIN_UNPCKHPD256, IX86_BUILTIN_UNPCKLPD256, IX86_BUILTIN_UNPCKHPS256, IX86_BUILTIN_UNPCKLPS256, IX86_BUILTIN_SI256_SI, IX86_BUILTIN_PS256_PS, IX86_BUILTIN_PD256_PD, IX86_BUILTIN_SI_SI256, IX86_BUILTIN_PS_PS256, IX86_BUILTIN_PD_PD256, IX86_BUILTIN_VTESTZPD, IX86_BUILTIN_VTESTCPD, IX86_BUILTIN_VTESTNZCPD, IX86_BUILTIN_VTESTZPS, IX86_BUILTIN_VTESTCPS, IX86_BUILTIN_VTESTNZCPS, IX86_BUILTIN_VTESTZPD256, IX86_BUILTIN_VTESTCPD256, IX86_BUILTIN_VTESTNZCPD256, IX86_BUILTIN_VTESTZPS256, IX86_BUILTIN_VTESTCPS256, IX86_BUILTIN_VTESTNZCPS256, IX86_BUILTIN_PTESTZ256, IX86_BUILTIN_PTESTC256, IX86_BUILTIN_PTESTNZC256, IX86_BUILTIN_MOVMSKPD256 and IX86_BUILTIN_MOVMSKPS256. (ix86_init_mmx_sse_builtins): Support AVX builtins. (ix86_expand_args_builtin): Likewise. (ix86_expand_special_args_builtin): Likewise. (ix86_hard_regno_mode_ok): Handle AVX modes. (ix86_expand_vector_init_duplicate): Likewise. (ix86_expand_vector_init_one_nonzero): Likewise. (ix86_expand_vector_init_one_var): Likewise. (ix86_expand_vector_init_concat): Likewise. (ix86_expand_vector_init_general): Likewise. (ix86_expand_vector_set): Likewise. (ix86_vector_mode_supported_p): Likewise. (x86_extended_reg_mentioned_p): Check INSN_P before using PATTERN. * config/i386/i386-c.c (ix86_target_macros_internal): Handle OPTION_MASK_ISA_AVX and OPTION_MASK_ISA_FMA. * config/i386/i386.h (TARGET_AVX): New. (TARGET_FMA): Likewise. 
(TARGET_CPU_CPP_BUILTINS): Handle TARGET_AVX and TARGET_FMA. (BIGGEST_ALIGNMENT): Set to 256 for TARGET_AVX. (VALID_AVX256_REG_MODE): New. (AVX256_VEC_FLOAT_MODE_P): Likewise. (AVX_FLOAT_MODE_P): Likewise. (AVX128_VEC_FLOAT_MODE_P): Likewise. (AVX256_VEC_FLOAT_MODE_P): Likewise. (AVX_VEC_FLOAT_MODE_P): Likewise. (ASM_OUTPUT_AVX_PREFIX): Likewise. (ASM_OUTPUT_OPCODE): Likewise. (UNITS_PER_SIMD_WORD): Add a FIXME for 32byte vectorizer support. (SSE_REG_MODE_P): Allow 256bit vector modes. (ix86_args): Add a warn_avx field. * config/i386/i386.md (UNSPEC_PCMP): New. (UNSPEC_VPERMIL): Likewise. (UNSPEC_VPERMIL2): Likewise. (UNSPEC_VPERMIL2F128): Likewise. (UNSPEC_MASKLOAD): Likewise. (UNSPEC_MASKSTORE): Likewise. (UNSPEC_CAST): Likewise. (UNSPEC_VTESTP): Likewise. (UNSPECV_VZEROALL): Likewise. (UNSPECV_VZEROUPPER): Likewise. (XMM0_REG): Likewise. (XMM1_REG): Likewise. (XMM2_REG): Likewise. (XMM3_REG): Likewise. (XMM4_REG): Likewise. (XMM5_REG): Likewise. (XMM6_REG): Likewise. (XMM8_REG): Likewise. (XMM9_REG): Likewise. (XMM10_REG): Likewise. (XMM11_REG): Likewise. (XMM12_REG): Likewise. (XMM13_REG): Likewise. (XMM14_REG): Likewise. (XMM15_REG): Likewise. (prefix): Likewise. (prefix_vex_imm8): Likewise. (prefix_vex_w): Likewise. (length_vex): Likewise. (maxmin): Likewise. (movoi): Likewise. (*avx_ashlti3): Likewise. (*avx_lshrti3): Likewise. (*avx_setcc): Likewise. (*fop__comm_mixed_avx): Likewise. (*fop__comm_avx): Likewise. (*fop__1_mixed_avx): Likewise. (*fop__1_avx): Likewise. (*avx_3): Likewise. (*avx_ieee_smin3): Likewise. (*avx_ieee_smax3): Likewise. (mode): Add OI, V8SF and V4DF. (length): Support VEX prefix. (*cmpfp_i_mixed): Set prefix attribute. (*cmpfp_i_sse): Likewise. (*cmpfp_iu_mixed): Likewise. (*cmpfp_iu_sse): Likewise. (*movsi_1): Support AVX. (*movdi_2): Likewise. (*movdi_1_rex64): Likewise. (*movti_internal): Likewise. (*movti_rex64): Likewise. (*movsf_1): Likewise. (*movdf_nointeger): Likewise. (*movdf_integer_rex64): Likewise. (*movtf_internal): Likewise. (zero_extendsidi2_32): Likewise. (zero_extendsidi2_rex64): Likewise. (*extendsfdf2_mixed): Likewise. (*extendsfdf2_sse): Likewise. (*truncdfsf_fast_mixed): Likewise. (*truncdfsf_fast_sse): Likewise. (*truncdfsf_mixed): Likewise. (fix_truncdi_sse): Likewise. (fix_truncsi_sse): Likewise. (*float2_mixed_interunit): Likewise. (*float2_mixed_nointerunit): Likewise. (*float2_sse_interunit): Likewise. (*float2_sse_nointerunit): Likewise. (*rcpsf2_sse): Likewise. (*rsqrtsf2_sse): Likewise. (*sqrt2_sse): Likewise. (sse4_1_round2): Likewise. (*sse_prologue_save_insn): Disallow REX prefix for AVX. Support AVX. Set length attribute properly for AVX. * config/i386/i386-modes.def (VECTOR_MODES (INT, 32)): New. (VECTOR_MODES (FLOAT, 32)): Likewise. (VECTOR_MODE (INT, DI, 8)): Likewise. (VECTOR_MODE (INT, HI, 32)): Likewise. (VECTOR_MODE (INT, QI, 64)): Likewise. (VECTOR_MODE (FLOAT, DF, 8)): Likewise. (VECTOR_MODE (FLOAT, SF, 16)): Likewise. (VECTOR_MODE (INT, DI, 4)): Removed. (VECTOR_MODE (INT, SI, 8)): Likewise. (VECTOR_MODE (INT, HI, 16)): Likewise. (VECTOR_MODE (INT, QI, 32)): Likewise. (VECTOR_MODE (FLOAT, SF, 8)): Likewise. (INT_MODE (OI, 32)): Likewise. * config/i386/i386.opt (mavx): New. (mfma): Likewise. * config/i386/i386-protos.h (ix86_attr_length_vex_default): New. * config/i386/mmx.md (*mov_internal_rex64): Support AVX. (*mov_internal_avx): New. (*movv2sf_internal_rex64_avx): Likewise. (*movv2sf_internal_avx): Likewise. * config/i386/predicates.md (const_4_to_5_operand): New. (const_6_to_7_operand): Likewise. 
(const_8_to_11_operand): Likewise. (const_12_to_15_operand): Likewise. (avx_comparison_float_operator): Likewise. * config/i386/sse.md (AVX256MODEI): New. (AVX256MODE): Likewise. (AVXMODEQI): Likewise. (AVXMODE): Likewise. (AVX256MODEF2P): Likewise. (AVX256MODE2P): Likewise. (AVX256MODE4P): Likewise. (AVX256MODE8P): Likewise. (AVXMODEF2P): Likewise. (AVXMODEF4P): Likewise. (AVXMODEDCVTDQ2PS): Likewise. (AVXMODEDCVTPS2DQ): Likewise. (avxvecmode): Likewise. (avxvecpsmode): Likewise. (avxhalfvecmode): Likewise. (avxscalarmode): Likewise. (avxcvtvecmode): Likewise. (avxpermvecmode): Likewise. (avxmodesuffixf2c): Likewise. (avxmodesuffixp): Likewise. (avxmodesuffixs): Likewise. (avxmodesuffix): Likewise. (vpermilbits): Likewise. (pinsrbits): Likewise. (mov): Likewise. (*mov_internal): Likewise. (push1): Likewise. (movmisalign): Likewise. (avx_movup): Likewise. (avx_movdqu): Likewise. (avx_lddqu): Likewise. (3): Likewise. (*avx_3): Likewise. (*avx_vm3): Likewise. (mul3): Likewise. (*avx_mul3): Likewise. (*avx_vmmul3): Likewise. (divv8sf3): Likewise. (divv4df3): Likewise. (avx_div3): Likewise. (*avx_div3): Likewise. (*avx_vmdiv3): Likewise. (avx_rcpv8sf2): Likewise. (*avx_vmrcpv4sf2): Likewise. (sqrtv8sf2): Likewise. (avx_sqrtv8sf2): Likewise. (*avx_vmsqrt2): Likewise. (rsqrtv8sf2): Likewise. (avx_rsqrtv8sf2): Likewise. (*avx_vmrsqrtv4sf2): Likewise. (3): Likewise. (*avx_3_finite): Likewise. (*avx_3): Likewise. (*avx_vm3): Likewise. (*avx_ieee_smin3): Likewise. (*avx_ieee_smax3): Likewise. (avx_addsubv8sf3): Likewise. (avx_addsubv4df3): Likewise. (*avx_addsubv4sf3): Likewise. (*avx_addsubv2df3): Likewise. (avx_hv4df3): Likewise. (avx_hv8sf3): Likewise. (*avx_hv4sf3): Likewise. (*avx_hv2df3): Likewise. (avx_cmpp3): Likewise. (avx_cmps3): Likewise. (*avx_maskcmp3): Likewise. (avx_nand3): Likewise. (*avx_3): Likewise. (*avx_nand3): Likewise. (*avx_3): Likewise. (*avx_cvtsi2ss): Likewise. (*avx_cvtsi2ssq): Likewise. (*avx_cvtsi2sd): Likewise. (*avx_cvtsi2sdq): Likewise. (*avx_cvtsd2ss): Likewise. (avx_cvtss2sd): Likewise. (avx_cvtdq2ps): Likewise. (avx_cvtps2dq): Likewise. (avx_cvttps2dq): Likewise. (*avx_cvtsi2sd): Likewise. (*avx_cvtsi2sdq): Likewise. (avx_cvtdq2pd256): Likewise. (avx_cvtpd2dq256): Likewise. (avx_cvttpd2dq256): Likewise. (*avx_cvtsd2ss): Likewise. (*avx_cvtss2sd): Likewise. (avx_cvtpd2ps256): Likewise. (avx_cvtps2pd256): Likewise. (*avx_movhlps): Likewise. (*avx_movlhps): Likewise. (avx_unpckhps256): Likewise. (*avx_unpckhps): Likewise. (avx_unpcklps256): Likewise. (*avx_unpcklps): Likewise. (avx_movshdup256): Likewise. (avx_movsldup256): Likewise. (avx_shufps256): Likewise. (avx_shufps256_1): Likewise. (*avx_shufps_): Likewise. (*avx_loadhps): Likewise. (*avx_storelps): Likewise. (*avx_loadlps): Likewise. (*avx_movss): Likewise. (*vec_dupv4sf_avx): Likewise. (*vec_concatv2sf_avx): Likewise. (*vec_concatv4sf_avx): Likewise. (*vec_setv4sf_0_avx): Likewise. (*vec_setv4sf_avx): Likewise. (*avx_insertps): Likewise. (avx_vextractf128): Likewise. (vec_extract_lo_): Likewise. (vec_extract_hi_): Likewise. (vec_extract_lo_): Likewise. (vec_extract_hi_): Likewise. (vec_extract_lo_v16hi): Likewise. (vec_extract_hi_v16hi): Likewise. (vec_extract_lo_v32qi): Likewise. (vec_extract_hi_v32qi): Likewise. (avx_unpckhpd256): Likewise. (*avx_unpckhpd): Likewise. (avx_movddup256): Likewise. (*avx_movddup): Likewise. (avx_unpcklpd256): Likewise. (*avx_unpcklpd): Likewise. (avx_shufpd256): Likewise. (avx_shufpd256_1): Likewise. (*avx_punpckhqdq): Likewise. (*avx_punpcklqdq): Likewise. 
(*avx_shufpd_): Likewise. (*avx_storehpd): Likewise. (*avx_loadhpd): Likewise. (*avx_loadlpd): Likewise. (*avx_movsd): Likewise. (*vec_concatv2df_avx): Likewise. (*avx_3): Likewise. (*avx_3): Likewise. (*avx_mulv8hi3): Likewise. (*avxv8hi3_highpart): Likewise. (*avx_umulv8hi3_highpart): Likewise. (*avx_umulv2siv2di3): Likewise. (*avx_mulv2siv2di3): Likewise. (*avx_pmaddwd): Likewise. (*avx_mulv4si3): Likewise. (*avx_ashr3): Likewise. (*avx_lshr3): Likewise. (*avx_ashl3): Likewise. (*avx_3): Likewise. (*avx_eq3): Likewise. (*avx_gt3): Likewise. (*avx_nand3): Likewise. (*avx_nand3): Likewise. (*avx_3): Likewise. (*avx_3): Likewise. (*avx_packsswb): Likewise. (*avx_packssdw): Likewise. (*avx_packuswb): Likewise. (*avx_punpckhbw): Likewise. (*avx_punpcklbw): Likewise. (*avx_punpckhwd): Likewise. (*avx_punpcklwd): Likewise. (*avx_punpckhdq): Likewise. (*avx_punpckldq): Likewise. (*avx_pinsr): Likewise. (*avx_pinsrq): Likewise. (*avx_loadld): Likewise. (*vec_extractv2di_1_rex64_avx): Likewise. (*vec_extractv2di_1_avx): Likewise. (*vec_dupv2di_avx): Likewise. (*vec_concatv2si_avx): Likewise. (*vec_concatv4si_1_avx): Likewise. (*vec_concatv2di_avx): Likewise. (*vec_concatv2di_rex64_avx): Likewise. (*avx_uavgv16qi3): Likewise. (*avx_uavgv8hi3): Likewise. (*avx_psadbw): Likewise. (avx_movmskp256): Likewise. (*avx_phaddwv8hi3): Likewise. (*avx_phadddv4si3): Likewise. (*avx_phaddswv8hi3): Likewise. (*avx_phsubwv8hi3): Likewise. (*avx_phsubdv4si3): Likewise. (*avx_phsubswv8hi3): Likewise. (*avx_pmaddubsw128): Likewise. (*avx_pmulhrswv8hi3): Likewise. (*avx_pshufbv16qi3): Likewise. (*avx_psign3): Likewise. (*avx_palignrti): Likewise. (avx_blendp): Likewise. (avx_blendvp): Likewise. (avx_dpp): Likewise. (*avx_mpsadbw): Likewise. (*avx_packusdw): Likewise. (*avx_pblendvb): Likewise. (*avx_pblendw): Likewise. (avx_vtestp): Likewise. (avx_ptest256): Likewise. (avx_roundp256): Likewise. (*avx_rounds): Likewise. (*avx_aesenc): Likewise. (*avx_aesenclast): Likewise. (*avx_aesdec): Likewise. (*avx_aesdeclast): Likewise. (avx_vzeroupper): Likewise. (avx_vzeroupper_rex64): Likewise. (avx_vpermil): Likewise. (avx_vpermilvar3): Likewise. (avx_vpermil23): Likewise. (avx_vperm2f1283): Likewise. (avx_vbroadcasts): Likewise. (avx_vbroadcastss256): Likewise. (avx_vbroadcastf128_p256): Likewise. (avx_vinsertf128): Likewise. (vec_set_lo_): Likewise. (vec_set_hi_): Likewise. (vec_set_lo_): Likewise. (vec_set_hi_): Likewise. (vec_set_lo_v16hi): Likewise. (vec_set_hi_v16hi): Likewise. (vec_set_lo_v32qi): Likewise. (vec_set_hi_v32qi): Likewise. (avx_maskloadp): Likewise. (avx_maskstorep): Likewise. (avx__): Likewise. (avx__): Likewise. (vec_init): Likewise. (*vec_concat_avx): Likewise. (blendbits): Support V8SF and V4DF. (sse2_movq128): Support AVX. (_movnt): Likewise. (sse2_movntv2di): Likewise. (sse_rcpv4sf2): Likewise. (sse_sqrtv4sf2): Likewise. (sse_rsqrtv4sf2): Likewise. (_comi): Likewise. (_ucomi): Likewise. (sse_cvtss2si): Likewise. (sse_cvtss2si_2): Likewise. (sse_cvtss2siq): Likewise. (sse_cvtss2siq_2): Likewise. (sse_cvttss2si): Likewise. (sse_cvttss2siq): Likewise. (sse2_cvtsd2si): Likewise. (sse2_cvtsd2si_2): Likewise. (sse2_cvtsd2siq): Likewise. (sse2_cvtsd2siq_2): Likewise. (sse2_cvttsd2si): Likewise. (sse2_cvttsd2siq): Likewise. (sse2_cvtdq2pd): Likewise. (*sse2_cvtpd2dq): Likewise. (*sse2_cvttpd2dq): Likewise. (*sse2_cvtpd2ps): Likewise. (sse2_cvtps2pd): Likewise. (sse3_movshdup): Likewise. (sse3_movsldup): Likewise. (sse_storehps): Likewise. (*sse4_1_extractps): Likewise. (sse2_storelpd): Likewise. 
(vec_dupv2df_sse3): Likewise. (*vec_concatv2df_sse3): Likewise. (*sse4_1_pextrb): Likewise. (*sse4_1_pextrb_memory): Likewise. (*sse2_pextrw): Likewise. (*sse4_1_pextrw_memory): Likewise. (*sse4_1_pextrd): Likewise. (*sse4_1_pextrq): Likewise. (sse2_pshufd_1): Likewise. (sse2_pshuflw_1): Likewise. (sse2_pshufhw_1): Likewise. (*sse2_storeq_rex64): Likewise. (*vec_dupv4si): Likewise. (_movmskp): Likewise. (sse2_pmovmskb): Likewise. (*sse2_maskmovdqu): Likewise. (*sse2_maskmovdqu_rex64): Likewise. (sse_ldmxcsr): Likewise. (sse_stmxcsr): Likewise. (abs2): Likewise. (sse4_1_movntdqa): Likewise. (sse4_1_phminposuw): Likewise. (sse4_1_extendv8qiv8hi2): Likewise. (*sse4_1_extendv8qiv8hi2): Likewise. (sse4_1_extendv4qiv4si2): Likewise. (*sse4_1_extendv4qiv4si2): Likewise. (sse4_1_extendv2qiv2di2): Likewise. (*sse4_1_extendv2qiv2di2): Likewise. (sse4_1_extendv4hiv4si2): Likewise. (*sse4_1_extendv4hiv4si2): Likewise. (sse4_1_extendv2hiv2di2): Likewise. (*sse4_1_extendv2hiv2di2): Likewise. (sse4_1_extendv2siv2di2): Likewise. (*sse4_1_extendv2siv2di2): Likewise. (sse4_1_zero_extendv8qiv8hi2): Likewise. (*sse4_1_zero_extendv8qiv8hi2): Likewise. (sse4_1_zero_extendv4qiv4si2): Likewise. (*sse4_1_zero_extendv4qiv4si2): Likewise. (sse4_1_zero_extendv2qiv2di2): Likewise. (*sse4_1_zero_extendv2qiv2di2): Likewise. (sse4_1_zero_extendv4hiv4si2): Likewise. (*sse4_1_zero_extendv4hiv4si2): Likewise. (sse4_1_zero_extendv2hiv2di2): Likewise. (*sse4_1_zero_extendv2hiv2di2): Likewise. (sse4_1_zero_extendv2siv2di2): Likewise. (*sse4_1_zero_extendv2siv2di2): Likewise. (sse4_1_ptest): Likewise. (sse4_1_roundp): Likewise. (sse4_2_pcmpestri): Likewise. (sse4_2_pcmpestrm): Likewise. (sse4_2_pcmpistri): Likewise. (sse4_2_pcmpistrm): Likewise. (aesimc): Likewise. (aeskeygenassist): Likewise. 2008-08-28 Uros Bizjak * config/i386/predicates.md (vzeroall_operation): New. * config/i386/sse.md (avx_vzeroall): New. (*avx_vzeroall): Likewise. From-SVN: r139726 --- gcc/ChangeLog | 687 +++++++- gcc/config.gcc | 4 +- gcc/config/i386/cpuid.h | 4 + gcc/config/i386/gas.h | 3 + gcc/config/i386/gmmintrin.h | 1482 +++++++++++++++++ gcc/config/i386/i386-c.c | 4 + gcc/config/i386/i386-modes.def | 15 +- gcc/config/i386/i386-protos.h | 1 + gcc/config/i386/i386.c | 1627 +++++++++++++++++- gcc/config/i386/i386.h | 59 +- gcc/config/i386/i386.md | 510 +++++- gcc/config/i386/i386.opt | 8 + gcc/config/i386/mmx.md | 96 +- gcc/config/i386/predicates.md | 37 + gcc/config/i386/sse.md | 3576 ++++++++++++++++++++++++++++++++++++++-- 15 files changed, 7808 insertions(+), 305 deletions(-) create mode 100644 gcc/config/i386/gmmintrin.h diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 0f652e6..b326cc6 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,6 +1,683 @@ +2008-08-28 H.J. Lu + Joey Ye + Xuepeng Guo + + * config.gcc (extra_headers): Add gmmintrin.h for x86 and x86-64. + + * config/i386/cpuid.h (bit_FMA): New. + (bit_XSAVE): Likewise. + (bit_OSXSAVE): Likewise. + (bit_AVX): Likewise. + + * config/i386/gas.h (ASM_OUTPUT_OPCODE): Undefine before + define. Use ASM_OUTPUT_AVX_PREFIX. + + * config/i386/gmmintrin.h: New. + + * config/i386/i386.c (x86_64_reg_class): Add X86_64_AVX_CLASS. + (OPTION_MASK_ISA_AVX_SET): New. + (OPTION_MASK_ISA_FMA_SET): Likewise. + (OPTION_MASK_ISA_AVX_UNSET): Likewise. + (OPTION_MASK_ISA_FMA_SET): Likewise. + (OPTION_MASK_ISA_SSE4_2_UNSET): Updated. + (ix86_handle_option): Handle OPT_mavx and OPT_mfma. + (pta_flags): Add PTA_AVX and PTA_FMA. + (override_options): Handle PTA_AVX and PTA_FMA. 
+ (init_cumulative_args): Handle warn_avx. + (classify_argument): Return 0 for COImode and OImode. Return + 1 and X86_64_AVX_CLASS for 256bit vector types. + (examine_argument): Handle X86_64_AVX_CLASS. + (construct_container): Likewise. + (function_arg_advance_32): Pass OImode and 256bit vector types + in AVX register. + (function_arg_advance_64): Take a new argument to indicate if a + parameter is named. Handle 256bit vector types. Return + immediately for unnamed 256bit vector mode parameters. + (function_arg_advance): Updated. + (function_arg_32): Add comments for TImode. Handle OImode + and 256bit vector types. + (function_arg_64): Take a new argument to indicate if a + parameter is named. Handle 256bit vector types. Return NULL + for unnamed 256bit vector mode parameters. + (function_arg): Updated. + (setup_incoming_varargs_64): Support + AVX encoding for *sse_prologue_save_insn. + (ix86_gimplify_va_arg): Handle 256bit vector mode parameters. + (standard_sse_constant_p): Return -2 for all 1s if SSE2 isn't + enabled. For all 1s in 256bit vector modes, return 3 if AVX is + enabled, otherwise return -3. + (standard_sse_constant_opcode): Handle AVX and 256bit vector + modes. + (print_reg): Support AVX registers. Handle 'x' and 't'. + Handle 'd' to duplicate the operand. + (print_operand): Likewise. Also support AVX vector compare + instructions. + (output_387_binary_op): Support AVX. + (output_fp_compare): Likewise. + (ix86_expand_vector_move_misalign): Likewise. + (ix86_attr_length_vex_default): New. + (ix86_builtins): Add IX86_BUILTIN_ADDPD256, + IX86_BUILTIN_ADDPS256, IX86_BUILTIN_ADDSUBPD256, + IX86_BUILTIN_ADDSUBPS256, IX86_BUILTIN_ANDPD256, + IX86_BUILTIN_ANDPS256, IX86_BUILTIN_ANDNPD256, + IX86_BUILTIN_ANDNPS256, IX86_BUILTIN_BLENDPD256, + IX86_BUILTIN_BLENDPS256, IX86_BUILTIN_BLENDVPD256, + IX86_BUILTIN_BLENDVPS256, IX86_BUILTIN_DIVPD256, + IX86_BUILTIN_DIVPS256, IX86_BUILTIN_DPPS256, + IX86_BUILTIN_HADDPD256, IX86_BUILTIN_HADDPS256, + IX86_BUILTIN_HSUBPD256, IX86_BUILTIN_HSUBPS256, + IX86_BUILTIN_MAXPD256, IX86_BUILTIN_MAXPS256, + IX86_BUILTIN_MINPD256, IX86_BUILTIN_MINPS256, + IX86_BUILTIN_MULPD256, IX86_BUILTIN_MULPS256, + IX86_BUILTIN_ORPD256, IX86_BUILTIN_ORPS256, + IX86_BUILTIN_SHUFPD256, IX86_BUILTIN_SHUFPS256, + IX86_BUILTIN_SUBPD256, IX86_BUILTIN_SUBPS256, + IX86_BUILTIN_XORPD256, IX86_BUILTIN_XORPS256, + IX86_BUILTIN_CMPSD, IX86_BUILTIN_CMPSS, IX86_BUILTIN_CMPPD, + IX86_BUILTIN_CMPPS, IX86_BUILTIN_CMPPD256, + IX86_BUILTIN_CMPPS256, IX86_BUILTIN_CVTDQ2PD256, + IX86_BUILTIN_CVTDQ2PS256, IX86_BUILTIN_CVTPD2PS256, + IX86_BUILTIN_CVTPS2DQ256, IX86_BUILTIN_CVTPS2PD256, + IX86_BUILTIN_CVTTPD2DQ256, IX86_BUILTIN_CVTPD2DQ256, + IX86_BUILTIN_CVTTPS2DQ256, IX86_BUILTIN_EXTRACTF128PD256, + IX86_BUILTIN_EXTRACTF128PS256, IX86_BUILTIN_EXTRACTF128SI256, + IX86_BUILTIN_VZEROALL, IX86_BUILTIN_VZEROUPPER, + IX86_BUILTIN_VZEROUPPER_REX64, IX86_BUILTIN_VPERMILVARPD, + IX86_BUILTIN_VPERMILVARPS, IX86_BUILTIN_VPERMILVARPD256, + IX86_BUILTIN_VPERMILVARPS256, IX86_BUILTIN_VPERMILPD, + IX86_BUILTIN_VPERMILPS, IX86_BUILTIN_VPERMILPD256, + IX86_BUILTIN_VPERMILPS256, IX86_BUILTIN_VPERMIL2PD, + IX86_BUILTIN_VPERMIL2PS, IX86_BUILTIN_VPERMIL2PD256, + IX86_BUILTIN_VPERMIL2PS256, IX86_BUILTIN_VPERM2F128PD256, + IX86_BUILTIN_VPERM2F128PS256, IX86_BUILTIN_VPERM2F128SI256, + IX86_BUILTIN_VBROADCASTSS, IX86_BUILTIN_VBROADCASTSD256, + IX86_BUILTIN_VBROADCASTSS256, IX86_BUILTIN_VBROADCASTPD256, + IX86_BUILTIN_VBROADCASTPS256, IX86_BUILTIN_VINSERTF128PD256, + IX86_BUILTIN_VINSERTF128PS256, 
IX86_BUILTIN_VINSERTF128SI256, + IX86_BUILTIN_LOADUPD256, IX86_BUILTIN_LOADUPS256, + IX86_BUILTIN_STOREUPD256, IX86_BUILTIN_STOREUPS256, + IX86_BUILTIN_LDDQU256, IX86_BUILTIN_LOADDQU256, + IX86_BUILTIN_STOREDQU256, IX86_BUILTIN_MASKLOADPD, + IX86_BUILTIN_MASKLOADPS, IX86_BUILTIN_MASKSTOREPD, + IX86_BUILTIN_MASKSTOREPS, IX86_BUILTIN_MASKLOADPD256, + IX86_BUILTIN_MASKLOADPS256, IX86_BUILTIN_MASKSTOREPD256, + IX86_BUILTIN_MASKSTOREPS256, IX86_BUILTIN_MOVSHDUP256, + IX86_BUILTIN_MOVSLDUP256, IX86_BUILTIN_MOVDDUP256, + IX86_BUILTIN_SQRTPD256, IX86_BUILTIN_SQRTPS256, + IX86_BUILTIN_SQRTPS_NR256, IX86_BUILTIN_RSQRTPS256, + IX86_BUILTIN_RSQRTPS_NR256, IX86_BUILTIN_RCPPS256, + IX86_BUILTIN_ROUNDPD256, IX86_BUILTIN_ROUNDPS256, + IX86_BUILTIN_UNPCKHPD256, IX86_BUILTIN_UNPCKLPD256, + IX86_BUILTIN_UNPCKHPS256, IX86_BUILTIN_UNPCKLPS256, + IX86_BUILTIN_SI256_SI, IX86_BUILTIN_PS256_PS, + IX86_BUILTIN_PD256_PD, IX86_BUILTIN_SI_SI256, + IX86_BUILTIN_PS_PS256, IX86_BUILTIN_PD_PD256, + IX86_BUILTIN_VTESTZPD, IX86_BUILTIN_VTESTCPD, + IX86_BUILTIN_VTESTNZCPD, IX86_BUILTIN_VTESTZPS, + IX86_BUILTIN_VTESTCPS, IX86_BUILTIN_VTESTNZCPS, + IX86_BUILTIN_VTESTZPD256, IX86_BUILTIN_VTESTCPD256, + IX86_BUILTIN_VTESTNZCPD256, IX86_BUILTIN_VTESTZPS256, + IX86_BUILTIN_VTESTCPS256, IX86_BUILTIN_VTESTNZCPS256, + IX86_BUILTIN_PTESTZ256, IX86_BUILTIN_PTESTC256, + IX86_BUILTIN_PTESTNZC256, IX86_BUILTIN_MOVMSKPD256 + and IX86_BUILTIN_MOVMSKPS256, + (ix86_special_builtin_type): Add V32QI_FTYPE_PCCHAR, + V8SF_FTYPE_PCV4SF, V8SF_FTYPE_PCFLOAT, V4DF_FTYPE_PCV2DF, + V4DF_FTYPE_PCDOUBLE, V8SF_FTYPE_PCV8SF_V8SF, + V4DF_FTYPE_PCV4DF_V4DF, V4SF_FTYPE_PCV4SF_V4SF, + V2DF_FTYPE_PCV2DF_V2DF, VOID_FTYPE_PCHAR_V32QI, + VOID_FTYPE_PFLOAT_V8SF, VOID_FTYPE_PDOUBLE_V4DF, + VOID_FTYPE_PV8SF_V8SF_V8SF, VOID_FTYPE_PV4DF_V4DF_V4DF, + VOID_FTYPE_PV4SF_V4SF_V4SF and VOID_FTYPE_PV2DF_V2DF_V2DF, + (ix86_builtin_type): Add INT_FTYPE_V8SF_V8SF_PTEST, + INT_FTYPE_V4DI_V4DI_PTEST, INT_FTYPE_V4DF_V4DF_PTEST, + INT_FTYPE_V4SF_V4SF_PTEST, INT_FTYPE_V2DF_V2DF_PTEST, + INT_FTYPE_V8SF, INT_FTYPE_V4DF, V8SI_FTYPE_V8SF, V8SI_FTYPE_V4SI, + V8SF_FTYPE_V8SF, V8SF_FTYPE_V8SI, V8SF_FTYPE_V4SF, + V4SI_FTYPE_V8SI, V4SI_FTYPE_V4DF, V4DF_FTYPE_V4DF, + V4DF_FTYPE_V4SI, V4DF_FTYPE_V4SF, V4DF_FTYPE_V2DF, + V4SF_FTYPE_V4DF, V4SF_FTYPE_V8SF, V2DF_FTYPE_V4DF, + V8SF_FTYPE_V8SF_V8SF, V8SF_FTYPE_V8SF_V8SI, + V4DF_FTYPE_V4DF_V4DF, V4DF_FTYPE_V4DF_V4DI, + V4SF_FTYPE_V4SF_V4SI, V2DF_FTYPE_V2DF_V2DI, + V8SF_FTYPE_V8SF_INT, V4SI_FTYPE_V8SI_INT, V4SF_FTYPE_V8SF_INT, + V2DF_FTYPE_V4DF_INT, V4DF_FTYPE_V4DF_INT, + V8SF_FTYPE_V8SF_V8SF_V8SF, V4DF_FTYPE_V4DF_V4DF_V4DF, + V8SI_FTYPE_V8SI_V8SI_INT, V8SF_FTYPE_V8SF_V8SF_INT, + V4DF_FTYPE_V4DF_V4DF_INT, V4DF_FTYPE_V4DF_V2DF_INT, + V8SF_FTYPE_V8SF_V8SF_V8SI_INT, V4DF_FTYPE_V4DF_V4DF_V4DI_INT, + V4SF_FTYPE_V4SF_V4SF_V4SI_INT and V2DF_FTYPE_V2DF_V2DF_V2DI_INT. + (bdesc_special_args): Add IX86_BUILTIN_VZEROALL, + IX86_BUILTIN_VZEROUPPER. IX86_BUILTIN_VZEROUPPER_REX64, + IX86_BUILTIN_VBROADCASTSS, IX86_BUILTIN_VBROADCASTSD256, + IX86_BUILTIN_VBROADCASTSS256, IX86_BUILTIN_VBROADCASTPD256, + IX86_BUILTIN_VBROADCASTPS256, IX86_BUILTIN_LOADUPD256, + IX86_BUILTIN_LOADUPS256, IX86_BUILTIN_STOREUPD256, + IX86_BUILTIN_STOREUPS256, IX86_BUILTIN_LOADDQU256, + IX86_BUILTIN_STOREDQU256, IX86_BUILTIN_LDDQU256, + IX86_BUILTIN_MASKLOADPD, IX86_BUILTIN_MASKLOADPS, + IX86_BUILTIN_MASKLOADPD256, IX86_BUILTIN_MASKLOADPS256, + IX86_BUILTIN_MASKSTOREPD, IX86_BUILTIN_MASKSTOREPS, + IX86_BUILTIN_MASKSTOREPD256 and IX86_BUILTIN_MASKSTOREPS256. 
+ (ix86_builtins): Add IX86_BUILTIN_ADDPD256, + IX86_BUILTIN_ADDPS256, IX86_BUILTIN_ADDSUBPD256, + IX86_BUILTIN_ADDSUBPS256, IX86_BUILTIN_ANDPD256, + IX86_BUILTIN_ANDPS256, IX86_BUILTIN_ANDNPD256, + IX86_BUILTIN_ANDNPS256, IX86_BUILTIN_DIVPD256, + IX86_BUILTIN_DIVPS256, IX86_BUILTIN_HADDPD256, + IX86_BUILTIN_HSUBPS256, IX86_BUILTIN_HSUBPD256, + IX86_BUILTIN_HADDPS256, IX86_BUILTIN_MAXPD256, + IX86_BUILTIN_MAXPS256, IX86_BUILTIN_MINPD256, + IX86_BUILTIN_MINPS256, IX86_BUILTIN_MULPD256, + IX86_BUILTIN_MULPS256, IX86_BUILTIN_ORPD256, + IX86_BUILTIN_ORPS256, IX86_BUILTIN_SUBPD256, + IX86_BUILTIN_SUBPS256, IX86_BUILTIN_XORPD256, + IX86_BUILTIN_XORPS256, IX86_BUILTIN_VPERMILVARPD, + IX86_BUILTIN_VPERMILVARPS, IX86_BUILTIN_VPERMILVARPD256, + IX86_BUILTIN_VPERMILVARPS256, IX86_BUILTIN_BLENDPD256, + IX86_BUILTIN_BLENDPS256, IX86_BUILTIN_BLENDVPD256, + IX86_BUILTIN_BLENDVPS256, IX86_BUILTIN_DPPS256, + IX86_BUILTIN_SHUFPD256, IX86_BUILTIN_SHUFPS256, + IX86_BUILTIN_CMPSD, IX86_BUILTIN_CMPSS, IX86_BUILTIN_CMPPD, + IX86_BUILTIN_CMPPS, + IX86_BUILTIN_CMPPD256,IX86_BUILTIN_CMPPS256, + IX86_BUILTIN_EXTRACTF128PD256, IX86_BUILTIN_EXTRACTF128PS256, + IX86_BUILTIN_EXTRACTF128SI256, IX86_BUILTIN_CVTDQ2PD256, + IX86_BUILTIN_CVTDQ2PS256, IX86_BUILTIN_CVTPD2PS256, + IX86_BUILTIN_CVTPS2DQ256, IX86_BUILTIN_CVTPS2PD256, + IX86_BUILTIN_CVTTPD2DQ256, IX86_BUILTIN_CVTPD2DQ256, + IX86_BUILTIN_CVTTPS2DQ256, IX86_BUILTIN_VPERM2F128PD256, + IX86_BUILTIN_VPERM2F128PS256, IX86_BUILTIN_VPERM2F128SI256, + IX86_BUILTIN_VPERMILPD, IX86_BUILTIN_VPERMILPS, + IX86_BUILTIN_VPERMILPD256, IX86_BUILTIN_VPERMILPS256, + IX86_BUILTIN_VPERMIL2PD, IX86_BUILTIN_VPERMILPS, + IX86_BUILTIN_VPERMILPD256, IX86_BUILTIN_VPERMILPS256, + IX86_BUILTIN_VPERMIL2PD, IX86_BUILTIN_VPERMIL2PS, + IX86_BUILTIN_VPERMIL2PD256, IX86_BUILTIN_VPERMIL2PS256, + IX86_BUILTIN_VINSERTF128PD256, IX86_BUILTIN_VINSERTF128PS256, + IX86_BUILTIN_VINSERTF128SI256, IX86_BUILTIN_MOVSHDUP256, + IX86_BUILTIN_MOVSLDUP256, IX86_BUILTIN_MOVDDUP256, + IX86_BUILTIN_SQRTPD256, IX86_BUILTIN_SQRTPS256, + IX86_BUILTIN_SQRTPS_NR256, IX86_BUILTIN_RSQRTPS256, + IX86_BUILTIN_RSQRTPS_NR256, IX86_BUILTIN_RCPPS256, + IX86_BUILTIN_ROUNDPD256, IX86_BUILTIN_ROUNDPS256, + IX86_BUILTIN_UNPCKHPD256, IX86_BUILTIN_UNPCKLPD256, + IX86_BUILTIN_UNPCKHPS256, IX86_BUILTIN_UNPCKLPS256, + IX86_BUILTIN_SI256_SI, IX86_BUILTIN_PS256_PS, + IX86_BUILTIN_PD256_PD, IX86_BUILTIN_SI_SI256, + IX86_BUILTIN_PS_PS256, IX86_BUILTIN_PD_PD256, + IX86_BUILTIN_VTESTZPD, IX86_BUILTIN_VTESTCPD, + IX86_BUILTIN_VTESTNZCPD, IX86_BUILTIN_VTESTZPS, + IX86_BUILTIN_VTESTCPS, IX86_BUILTIN_VTESTNZCPS, + IX86_BUILTIN_VTESTZPD256, IX86_BUILTIN_VTESTCPD256, + IX86_BUILTIN_VTESTNZCPD256, IX86_BUILTIN_VTESTZPS256, + IX86_BUILTIN_VTESTCPS256, IX86_BUILTIN_VTESTNZCPS256, + IX86_BUILTIN_PTESTZ256, IX86_BUILTIN_PTESTC256, + IX86_BUILTIN_PTESTNZC256, IX86_BUILTIN_MOVMSKPD256 and + IX86_BUILTIN_MOVMSKPS256. + (ix86_init_mmx_sse_builtins): Support AVX builtins. + (ix86_expand_args_builtin): Likewise. + (ix86_expand_special_args_builtin): Likewise. + (ix86_hard_regno_mode_ok): Handle AVX modes. + (ix86_expand_vector_init_duplicate): Likewise. + (ix86_expand_vector_init_one_nonzero): Likewise. + (ix86_expand_vector_init_one_var): Likewise. + (ix86_expand_vector_init_concat): Likewise. + (ix86_expand_vector_init_general): Likewise. + (ix86_expand_vector_set): Likewise. + (ix86_vector_mode_supported_p): Likewise. + (x86_extended_reg_mentioned_p): Check INSN_P before using + PATTERN. 
+ + * config/i386/i386-c.c (ix86_target_macros_internal): Handle + OPTION_MASK_ISA_AVX and OPTION_MASK_ISA_FMA. + + * config/i386/i386.h (TARGET_AVX): New. + (TARGET_FMA): Likewise. + (TARGET_CPU_CPP_BUILTINS): Handle TARGET_AVX and TARGET_FMA. + (BIGGEST_ALIGNMENT): Set to 256 for TARGET_AVX. + (VALID_AVX256_REG_MODE): New. + (AVX256_VEC_FLOAT_MODE_P): Likewise. + (AVX_FLOAT_MODE_P): Likewise. + (AVX128_VEC_FLOAT_MODE_P): Likewise. + (AVX256_VEC_FLOAT_MODE_P): Likewise. + (AVX_VEC_FLOAT_MODE_P): Likewise. + (ASM_OUTPUT_AVX_PREFIX): Likewise. + (ASM_OUTPUT_OPCODE): Likewise. + (UNITS_PER_SIMD_WORD): Add a FIXME for 32byte vectorizer + support. + (SSE_REG_MODE_P): Allow 256bit vector modes. + (ix86_args): Add a warn_avx field. + + * config/i386/i386.md (UNSPEC_PCMP): New. + (UNSPEC_VPERMIL): Likewise. + (UNSPEC_VPERMIL2): Likewise. + (UNSPEC_VPERMIL2F128): Likewise. + (UNSPEC_MASKLOAD): Likewise. + (UNSPEC_MASKSTORE): Likewise. + (UNSPEC_CAST): Likewise. + (UNSPEC_VTESTP): Likewise. + (UNSPECV_VZEROALL): Likewise. + (UNSPECV_VZEROUPPER): Likewise. + (XMM0_REG): Likewise. + (XMM1_REG): Likewise. + (XMM2_REG): Likewise. + (XMM3_REG): Likewise. + (XMM4_REG): Likewise. + (XMM5_REG): Likewise. + (XMM6_REG): Likewise. + (XMM8_REG): Likewise. + (XMM9_REG): Likewise. + (XMM10_REG): Likewise. + (XMM11_REG): Likewise. + (XMM12_REG): Likewise. + (XMM13_REG): Likewise. + (XMM14_REG): Likewise. + (XMM15_REG): Likewise. + (prefix): Likewise. + (prefix_vex_imm8): Likewise. + (prefix_vex_w): Likewise. + (length_vex): Likewise. + (maxmin): Likewise. + (movoi): Likewise. + (*avx_ashlti3): Likewise. + (*avx_lshrti3): Likewise. + (*avx_setcc): Likewise. + (*fop__comm_mixed_avx): Likewise. + (*fop__comm_avx): Likewise. + (*fop__1_mixed_avx): Likewise. + (*fop__1_avx): Likewise. + (*avx_3): Likewise. + (*avx_ieee_smin3): Likewise. + (*avx_ieee_smax3): Likewise. + (mode): Add OI, V8SF and V4DF. + (length): Support VEX prefix. + (*cmpfp_i_mixed): Set prefix attribute. + (*cmpfp_i_sse): Likewise. + (*cmpfp_iu_mixed): Likewise. + (*cmpfp_iu_sse): Likewise. + (*movsi_1): Support AVX. + (*movdi_2): Likewise. + (*movdi_1_rex64): Likewise. + (*movti_internal): Likewise. + (*movti_rex64): Likewise. + (*movsf_1): Likewise. + (*movdf_nointeger): Likewise. + (*movdf_integer_rex64): Likewise. + (*movtf_internal): Likewise. + (zero_extendsidi2_32): Likewise. + (zero_extendsidi2_rex64): Likewise. + (*extendsfdf2_mixed): Likewise. + (*extendsfdf2_sse): Likewise. + (*truncdfsf_fast_mixed): Likewise. + (*truncdfsf_fast_sse): Likewise. + (*truncdfsf_mixed): Likewise. + (fix_truncdi_sse): Likewise. + (fix_truncsi_sse): Likewise. + (*float2_mixed_interunit): Likewise. + (*float2_mixed_nointerunit): Likewise. + (*float2_sse_interunit): Likewise. + (*float2_sse_nointerunit): Likewise. + (*rcpsf2_sse): Likewise. + (*rsqrtsf2_sse): Likewise. + (*sqrt2_sse): Likewise. + (sse4_1_round2): Likewise. + (*sse_prologue_save_insn): Disallow REX prefix for AVX. + Support AVX. Set length attribute properly for AVX. + + * config/i386/i386-modes.def (VECTOR_MODES (INT, 32)): New. + (VECTOR_MODES (FLOAT, 32)): Likewise. + (VECTOR_MODE (INT, DI, 8)): Likewise. + (VECTOR_MODE (INT, HI, 32)): Likewise. + (VECTOR_MODE (INT, QI, 64)): Likewise. + (VECTOR_MODE (FLOAT, DF, 8)): Likewise. + (VECTOR_MODE (FLOAT, SF, 16)): Likewise. + (VECTOR_MODE (INT, DI, 4)): Removed. + (VECTOR_MODE (INT, SI, 8)): Likewise. + (VECTOR_MODE (INT, HI, 16)): Likewise. + (VECTOR_MODE (INT, QI, 32)): Likewise. + (VECTOR_MODE (FLOAT, SF, 8)): Likewise. 
+ (INT_MODE (OI, 32)): Likewise. + + * config/i386/i386.opt (mavx): New. + (mfma): Likewise. + + * config/i386/i386-protos.h (ix86_attr_length_vex_default): New. + + * config/i386/mmx.md (*mov_internal_rex64): Support AVX. + (*mov_internal_avx): New. + (*movv2sf_internal_rex64_avx): Likewise. + (*movv2sf_internal_avx): Likewise. + + * config/i386/predicates.md (const_4_to_5_operand): New. + (const_6_to_7_operand): Likewise. + (const_8_to_11_operand): Likewise. + (const_12_to_15_operand): Likewise. + (avx_comparison_float_operator): Likewise. + + * config/i386/sse.md (AVX256MODEI): New. + (AVX256MODE): Likewise. + (AVXMODEQI): Likewise. + (AVXMODE): Likewise. + (AVX256MODEF2P): Likewise. + (AVX256MODE2P): Likewise. + (AVX256MODE4P): Likewise. + (AVX256MODE8P): Likewise. + (AVXMODEF2P): Likewise. + (AVXMODEF4P): Likewise. + (AVXMODEDCVTDQ2PS): Likewise. + (AVXMODEDCVTPS2DQ): Likewise. + (avxvecmode): Likewise. + (avxvecpsmode): Likewise. + (avxhalfvecmode): Likewise. + (avxscalarmode): Likewise. + (avxcvtvecmode): Likewise. + (avxpermvecmode): Likewise. + (avxmodesuffixf2c): Likewise. + (avxmodesuffixp): Likewise. + (avxmodesuffixs): Likewise. + (avxmodesuffix): Likewise. + (vpermilbits): Likewise. + (pinsrbits): Likewise. + (mov): Likewise. + (*mov_internal): Likewise. + (push1): Likewise. + (movmisalign): Likewise. + (avx_movup): Likewise. + (avx_movdqu): Likewise. + (avx_lddqu): Likewise. + (3): Likewise. + (*avx_3): Likewise. + (*avx_vm3): Likewise. + (mul3): Likewise. + (*avx_mul3): Likewise. + (*avx_vmmul3): Likewise. + (divv8sf3): Likewise. + (divv4df3): Likewise. + (avx_div3): Likewise. + (*avx_div3): Likewise. + (*avx_vmdiv3): Likewise. + (avx_rcpv8sf2): Likewise. + (*avx_vmrcpv4sf2): Likewise. + (sqrtv8sf2): Likewise. + (avx_sqrtv8sf2): Likewise. + (*avx_vmsqrt2): Likewise. + (rsqrtv8sf2): Likewise. + (avx_rsqrtv8sf2): Likewise. + (*avx_vmrsqrtv4sf2): Likewise. + (3): Likewise. + (*avx_3_finite): Likewise. + (*avx_3): Likewise. + (*avx_vm3): Likewise. + (*avx_ieee_smin3): Likewise. + (*avx_ieee_smax3): Likewise. + (avx_addsubv8sf3): Likewise. + (avx_addsubv4df3): Likewise. + (*avx_addsubv4sf3): Likewise. + (*avx_addsubv2df3): Likewise. + (avx_hv4df3): Likewise. + (avx_hv8sf3): Likewise. + (*avx_hv4sf3): Likewise. + (*avx_hv2df3): Likewise. + (avx_cmpp3): Likewise. + (avx_cmps3): Likewise. + (*avx_maskcmp3): Likewise. + (avx_nand3): Likewise. + (*avx_3): Likewise. + (*avx_nand3): Likewise. + (*avx_3): Likewise. + (*avx_cvtsi2ss): Likewise. + (*avx_cvtsi2ssq): Likewise. + (*avx_cvtsi2sd): Likewise. + (*avx_cvtsi2sdq): Likewise. + (*avx_cvtsd2ss): Likewise. + (avx_cvtss2sd): Likewise. + (avx_cvtdq2ps): Likewise. + (avx_cvtps2dq): Likewise. + (avx_cvttps2dq): Likewise. + (*avx_cvtsi2sd): Likewise. + (*avx_cvtsi2sdq): Likewise. + (avx_cvtdq2pd256): Likewise. + (avx_cvtpd2dq256): Likewise. + (avx_cvttpd2dq256): Likewise. + (*avx_cvtsd2ss): Likewise. + (*avx_cvtss2sd): Likewise. + (avx_cvtpd2ps256): Likewise. + (avx_cvtps2pd256): Likewise. + (*avx_movhlps): Likewise. + (*avx_movlhps): Likewise. + (avx_unpckhps256): Likewise. + (*avx_unpckhps): Likewise. + (avx_unpcklps256): Likewise. + (*avx_unpcklps): Likewise. + (avx_movshdup256): Likewise. + (avx_movsldup256): Likewise. + (avx_shufps256): Likewise. + (avx_shufps256_1): Likewise. + (*avx_shufps_): Likewise. + (*avx_loadhps): Likewise. + (*avx_storelps): Likewise. + (*avx_loadlps): Likewise. + (*avx_movss): Likewise. + (*vec_dupv4sf_avx): Likewise. + (*vec_concatv2sf_avx): Likewise. + (*vec_concatv4sf_avx): Likewise. 
+ (*vec_setv4sf_0_avx): Likewise. + (*vec_setv4sf_avx): Likewise. + (*avx_insertps): Likewise. + (avx_vextractf128): Likewise. + (vec_extract_lo_): Likewise. + (vec_extract_hi_): Likewise. + (vec_extract_lo_): Likewise. + (vec_extract_hi_): Likewise. + (vec_extract_lo_v16hi): Likewise. + (vec_extract_hi_v16hi): Likewise. + (vec_extract_lo_v32qi): Likewise. + (vec_extract_hi_v32qi): Likewise. + (avx_unpckhpd256): Likewise. + (*avx_unpckhpd): Likewise. + (avx_movddup256): Likewise. + (*avx_movddup): Likewise. + (avx_unpcklpd256): Likewise. + (*avx_unpcklpd): Likewise. + (avx_shufpd256): Likewise. + (avx_shufpd256_1): Likewise. + (*avx_punpckhqdq): Likewise. + (*avx_punpcklqdq): Likewise. + (*avx_shufpd_): Likewise. + (*avx_storehpd): Likewise. + (*avx_loadhpd): Likewise. + (*avx_loadlpd): Likewise. + (*avx_movsd): Likewise. + (*vec_concatv2df_avx): Likewise. + (*avx_3): Likewise. + (*avx_3): Likewise. + (*avx_mulv8hi3): Likewise. + (*avxv8hi3_highpart): Likewise. + (*avx_umulv8hi3_highpart): Likewise. + (*avx_umulv2siv2di3): Likewise. + (*avx_mulv2siv2di3): Likewise. + (*avx_pmaddwd): Likewise. + (*avx_mulv4si3): Likewise. + (*avx_ashr3): Likewise. + (*avx_lshr3): Likewise. + (*avx_ashl3): Likewise. + (*avx_3): Likewise. + (*avx_eq3): Likewise. + (*avx_gt3): Likewise. + (*avx_nand3): Likewise. + (*avx_nand3): Likewise. + (*avx_3): Likewise. + (*avx_3): Likewise. + (*avx_packsswb): Likewise. + (*avx_packssdw): Likewise. + (*avx_packuswb): Likewise. + (*avx_punpckhbw): Likewise. + (*avx_punpcklbw): Likewise. + (*avx_punpckhwd): Likewise. + (*avx_punpcklwd): Likewise. + (*avx_punpckhdq): Likewise. + (*avx_punpckldq): Likewise. + (*avx_pinsr): Likewise. + (*avx_pinsrq): Likewise. + (*avx_loadld): Likewise. + (*vec_extractv2di_1_rex64_avx): Likewise. + (*vec_extractv2di_1_avx): Likewise. + (*vec_dupv2di_avx): Likewise. + (*vec_concatv2si_avx): Likewise. + (*vec_concatv4si_1_avx): Likewise. + (*vec_concatv2di_avx): Likewise. + (*vec_concatv2di_rex64_avx): Likewise. + (*avx_uavgv16qi3): Likewise. + (*avx_uavgv8hi3): Likewise. + (*avx_psadbw): Likewise. + (avx_movmskp256): Likewise. + (*avx_phaddwv8hi3): Likewise. + (*avx_phadddv4si3): Likewise. + (*avx_phaddswv8hi3): Likewise. + (*avx_phsubwv8hi3): Likewise. + (*avx_phsubdv4si3): Likewise. + (*avx_phsubswv8hi3): Likewise. + (*avx_pmaddubsw128): Likewise. + (*avx_pmulhrswv8hi3): Likewise. + (*avx_pshufbv16qi3): Likewise. + (*avx_psign3): Likewise. + (*avx_palignrti): Likewise. + (avx_blendp): Likewise. + (avx_blendvp): Likewise. + (avx_dpp): Likewise. + (*avx_mpsadbw): Likewise. + (*avx_packusdw): Likewise. + (*avx_pblendvb): Likewise. + (*avx_pblendw): Likewise. + (avx_vtestp): Likewise. + (avx_ptest256): Likewise. + (avx_roundp256): Likewise. + (*avx_rounds): Likewise. + (*avx_aesenc): Likewise. + (*avx_aesenclast): Likewise. + (*avx_aesdec): Likewise. + (*avx_aesdeclast): Likewise. + (avx_vzeroupper): Likewise. + (avx_vzeroupper_rex64): Likewise. + (avx_vpermil): Likewise. + (avx_vpermilvar3): Likewise. + (avx_vpermil23): Likewise. + (avx_vperm2f1283): Likewise. + (avx_vbroadcasts): Likewise. + (avx_vbroadcastss256): Likewise. + (avx_vbroadcastf128_p256): Likewise. + (avx_vinsertf128): Likewise. + (vec_set_lo_): Likewise. + (vec_set_hi_): Likewise. + (vec_set_lo_): Likewise. + (vec_set_hi_): Likewise. + (vec_set_lo_v16hi): Likewise. + (vec_set_hi_v16hi): Likewise. + (vec_set_lo_v32qi): Likewise. + (vec_set_hi_v32qi): Likewise. + (avx_maskloadp): Likewise. + (avx_maskstorep): Likewise. + (avx__): Likewise. + (avx__): Likewise. 
+ (vec_init): Likewise. + (*vec_concat_avx): Likewise. + (blendbits): Support V8SF and V4DF. + (sse2_movq128): Support AVX. + (_movnt): Likewise. + (sse2_movntv2di): Likewise. + (sse_rcpv4sf2): Likewise. + (sse_sqrtv4sf2): Likewise. + (sse_rsqrtv4sf2): Likewise. + (_comi): Likewise. + (_ucomi): Likewise. + (sse_cvtss2si): Likewise. + (sse_cvtss2si_2): Likewise. + (sse_cvtss2siq): Likewise. + (sse_cvtss2siq_2): Likewise. + (sse_cvttss2si): Likewise. + (sse_cvttss2siq): Likewise. + (sse2_cvtsd2si): Likewise. + (sse2_cvtsd2si_2): Likewise. + (sse2_cvtsd2siq): Likewise. + (sse2_cvtsd2siq_2): Likewise. + (sse2_cvttsd2si): Likewise. + (sse2_cvttsd2siq): Likewise. + (sse2_cvtdq2pd): Likewise. + (*sse2_cvtpd2dq): Likewise. + (*sse2_cvttpd2dq): Likewise. + (*sse2_cvtpd2ps): Likewise. + (sse2_cvtps2pd): Likewise. + (sse3_movshdup): Likewise. + (sse3_movsldup): Likewise. + (sse_storehps): Likewise. + (*sse4_1_extractps): Likewise. + (sse2_storelpd): Likewise. + (vec_dupv2df_sse3): Likewise. + (*vec_concatv2df_sse3): Likewise. + (*sse4_1_pextrb): Likewise. + (*sse4_1_pextrb_memory): Likewise. + (*sse2_pextrw): Likewise. + (*sse4_1_pextrw_memory): Likewise. + (*sse4_1_pextrd): Likewise. + (*sse4_1_pextrq): Likewise. + (sse2_pshufd_1): Likewise. + (sse2_pshuflw_1): Likewise. + (sse2_pshufhw_1): Likewise. + (*sse2_storeq_rex64): Likewise. + (*vec_dupv4si): Likewise. + (_movmskp): Likewise. + (sse2_pmovmskb): Likewise. + (*sse2_maskmovdqu): Likewise. + (*sse2_maskmovdqu_rex64): Likewise. + (sse_ldmxcsr): Likewise. + (sse_stmxcsr): Likewise. + (abs2): Likewise. + (sse4_1_movntdqa): Likewise. + (sse4_1_phminposuw): Likewise. + (sse4_1_extendv8qiv8hi2): Likewise. + (*sse4_1_extendv8qiv8hi2): Likewise. + (sse4_1_extendv4qiv4si2): Likewise. + (*sse4_1_extendv4qiv4si2): Likewise. + (sse4_1_extendv2qiv2di2): Likewise. + (*sse4_1_extendv2qiv2di2): Likewise. + (sse4_1_extendv4hiv4si2): Likewise. + (*sse4_1_extendv4hiv4si2): Likewise. + (sse4_1_extendv2hiv2di2): Likewise. + (*sse4_1_extendv2hiv2di2): Likewise. + (sse4_1_extendv2siv2di2): Likewise. + (*sse4_1_extendv2siv2di2): Likewise. + (sse4_1_zero_extendv8qiv8hi2): Likewise. + (*sse4_1_zero_extendv8qiv8hi2): Likewise. + (sse4_1_zero_extendv4qiv4si2): Likewise. + (*sse4_1_zero_extendv4qiv4si2): Likewise. + (sse4_1_zero_extendv2qiv2di2): Likewise. + (*sse4_1_zero_extendv2qiv2di2): Likewise. + (sse4_1_zero_extendv4hiv4si2): Likewise. + (*sse4_1_zero_extendv4hiv4si2): Likewise. + (sse4_1_zero_extendv2hiv2di2): Likewise. + (*sse4_1_zero_extendv2hiv2di2): Likewise. + (sse4_1_zero_extendv2siv2di2): Likewise. + (*sse4_1_zero_extendv2siv2di2): Likewise. + (sse4_1_ptest): Likewise. + (sse4_1_roundp): Likewise. + (sse4_2_pcmpestri): Likewise. + (sse4_2_pcmpestrm): Likewise. + (sse4_2_pcmpistri): Likewise. + (sse4_2_pcmpistrm): Likewise. + (aesimc): Likewise. + (aeskeygenassist): Likewise. + +2008-08-28 Uros Bizjak + + * config/i386/predicates.md (vzeroall_operation): New. + + * config/i386/sse.md (avx_vzeroall): New. + (*avx_vzeroall): Likewise. + 2008-08-28 Paul Brook - Mark Shinwell - Richard Earnshaw + Mark Shinwell + Richard Earnshaw * config/arm/arm.c (TARGET_MAX_ANCHOR_OFFSET): New. (TARGET_MIN_ANCHOR_OFFSET): New. @@ -94,7 +771,7 @@ 2008-08-28 Chris Fairles - * gthr-posix.h (__gthread_create, __gthread_join, __gthread_detach, + * gthr-posix.h (__gthread_create, __gthread_join, __gthread_detach, __gthread_mutex_timed_lock, __gthread_recursive_mutex_timed_lock, __gthread_cond_signal, __gthread_cond_timedwait, __gthread_cond_timedwait_recursive): New functions. 
@@ -522,8 +1199,8 @@ 2008-08-24 Razya Ladelsky - PR tree-optimization/37185 - * matrix-reorg.c (transform_access_sites): Update changed stmt. + PR tree-optimization/37185 + * matrix-reorg.c (transform_access_sites): Update changed stmt. 2008-08-23 Jan Hubicka diff --git a/gcc/config.gcc b/gcc/config.gcc index e06e2ae..877761b 100644 --- a/gcc/config.gcc +++ b/gcc/config.gcc @@ -299,7 +299,7 @@ i[34567]86-*-*) extra_headers="cpuid.h mmintrin.h mm3dnow.h xmmintrin.h emmintrin.h pmmintrin.h tmmintrin.h ammintrin.h smmintrin.h nmmintrin.h bmmintrin.h mmintrin-common.h - wmmintrin.h cross-stdarg.h" + wmmintrin.h gmmintrin.h cross-stdarg.h" ;; x86_64-*-*) cpu_type=i386 @@ -308,7 +308,7 @@ x86_64-*-*) extra_headers="cpuid.h mmintrin.h mm3dnow.h xmmintrin.h emmintrin.h pmmintrin.h tmmintrin.h ammintrin.h smmintrin.h nmmintrin.h bmmintrin.h mmintrin-common.h - wmmintrin.h cross-stdarg.h" + wmmintrin.h gmmintrin.h cross-stdarg.h" need_64bit_hwint=yes ;; ia64-*-*) diff --git a/gcc/config/i386/cpuid.h b/gcc/config/i386/cpuid.h index ce406c5..90a2813 100644 --- a/gcc/config/i386/cpuid.h +++ b/gcc/config/i386/cpuid.h @@ -35,11 +35,15 @@ #define bit_SSE3 (1 << 0) #define bit_PCLMUL (1 << 1) #define bit_SSSE3 (1 << 9) +#define bit_FMA (1 << 12) #define bit_CMPXCHG16B (1 << 13) #define bit_SSE4_1 (1 << 19) #define bit_SSE4_2 (1 << 20) #define bit_POPCNT (1 << 23) #define bit_AES (1 << 25) +#define bit_XSAVE (1 << 26) +#define bit_OSXSAVE (1 << 27) +#define bit_AVX (1 << 28) /* %edx */ #define bit_CMPXCHG8B (1 << 8) diff --git a/gcc/config/i386/gas.h b/gcc/config/i386/gas.h index 07d8e77..bf8ac48 100644 --- a/gcc/config/i386/gas.h +++ b/gcc/config/i386/gas.h @@ -86,6 +86,7 @@ along with GCC; see the file COPYING3. If not see GAS version 1.38.1 doesn't understand the `repz' opcode mnemonic. So use `repe' instead. */ +#undef ASM_OUTPUT_OPCODE #define ASM_OUTPUT_OPCODE(STREAM, PTR) \ { \ if ((PTR)[0] == 'r' \ @@ -103,6 +104,8 @@ along with GCC; see the file COPYING3. If not see (PTR) += 5; \ } \ } \ + else \ + ASM_OUTPUT_AVX_PREFIX ((STREAM), (PTR)); \ } /* Define macro used to output shift-double opcodes when the shift diff --git a/gcc/config/i386/gmmintrin.h b/gcc/config/i386/gmmintrin.h new file mode 100644 index 0000000..1c6bb18 --- /dev/null +++ b/gcc/config/i386/gmmintrin.h @@ -0,0 +1,1482 @@ +/* Copyright (C) 2008 Free Software Foundation, Inc. + + This file is part of GCC. + + GCC is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + GCC is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with GCC; see the file COPYING. If not, write to + the Free Software Foundation, 59 Temple Place - Suite 330, + Boston, MA 02111-1307, USA. */ + +/* As a special exception, if you include this header file into source + files compiled by GCC, this header file does not by itself cause + the resulting executable to be covered by the GNU General Public + License. This exception does not however invalidate any other + reasons why the executable file might be covered by the GNU General + Public License. 
*/ + +/* Implemented from the specification included in the Intel C++ Compiler + User Guide and Reference, version 11.0. */ + +#ifndef _GMMINTRIN_H_INCLUDED +#define _GMMINTRIN_H_INCLUDED + +#ifndef __AVX__ +# error "AVX instruction set not enabled" +#else + +/* We need definitions from the SSE4, SSSE3, SSE3, SSE2 and SSE header + files. */ +#include <smmintrin.h> + +/* Internal data types for implementing the intrinsics. */ +typedef double __v4df __attribute__ ((__vector_size__ (32))); +typedef float __v8sf __attribute__ ((__vector_size__ (32))); +typedef long long __v4di __attribute__ ((__vector_size__ (32))); +typedef int __v8si __attribute__ ((__vector_size__ (32))); +typedef short __v16hi __attribute__ ((__vector_size__ (32))); +typedef char __v32qi __attribute__ ((__vector_size__ (32))); + +/* The Intel API is flexible enough that we must allow aliasing with other + vector types, and their scalar components. */ +typedef float __m256 __attribute__ ((__vector_size__ (32), + __may_alias__)); +typedef long long __m256i __attribute__ ((__vector_size__ (32), + __may_alias__)); +typedef double __m256d __attribute__ ((__vector_size__ (32), + __may_alias__)); + +/* Compare predicates for scalar and packed compare intrinsics. */ + +/* Equal (ordered, non-signaling) */ +#define _CMP_EQ_OQ 0x00 +/* Less-than (ordered, signaling) */ +#define _CMP_LT_OS 0x01 +/* Less-than-or-equal (ordered, signaling) */ +#define _CMP_LE_OS 0x02 +/* Unordered (non-signaling) */ +#define _CMP_UNORD_Q 0x03 +/* Not-equal (unordered, non-signaling) */ +#define _CMP_NEQ_UQ 0x04 +/* Not-less-than (unordered, signaling) */ +#define _CMP_NLT_US 0x05 +/* Not-less-than-or-equal (unordered, signaling) */ +#define _CMP_NLE_US 0x06 +/* Ordered (nonsignaling) */ +#define _CMP_ORD_Q 0x07 +/* Equal (unordered, non-signaling) */ +#define _CMP_EQ_UQ 0x08 +/* Not-greater-than-or-equal (unordered, signaling) */ +#define _CMP_NGE_US 0x09 +/* Not-greater-than (unordered, signaling) */ +#define _CMP_NGT_US 0x0a +/* False (ordered, non-signaling) */ +#define _CMP_FALSE_OQ 0x0b +/* Not-equal (ordered, non-signaling) */ +#define _CMP_NEQ_OQ 0x0c +/* Greater-than-or-equal (ordered, signaling) */ +#define _CMP_GE_OS 0x0d +/* Greater-than (ordered, signaling) */ +#define _CMP_GT_OS 0x0e +/* True (unordered, non-signaling) */ +#define _CMP_TRUE_UQ 0x0f +/* Equal (ordered, signaling) */ +#define _CMP_EQ_OS 0x10 +/* Less-than (ordered, non-signaling) */ +#define _CMP_LT_OQ 0x11 +/* Less-than-or-equal (ordered, non-signaling) */ +#define _CMP_LE_OQ 0x12 +/* Unordered (signaling) */ +#define _CMP_UNORD_S 0x13 +/* Not-equal (unordered, signaling) */ +#define _CMP_NEQ_US 0x14 +/* Not-less-than (unordered, non-signaling) */ +#define _CMP_NLT_UQ 0x15 +/* Not-less-than-or-equal (unordered, non-signaling) */ +#define _CMP_NLE_UQ 0x16 +/* Ordered (signaling) */ +#define _CMP_ORD_S 0x17 +/* Equal (unordered, signaling) */ +#define _CMP_EQ_US 0x18 +/* Not-greater-than-or-equal (unordered, non-signaling) */ +#define _CMP_NGE_UQ 0x19 +/* Not-greater-than (unordered, non-signaling) */ +#define _CMP_NGT_UQ 0x1a +/* False (ordered, signaling) */ +#define _CMP_FALSE_OS 0x1b +/* Not-equal (ordered, signaling) */ +#define _CMP_NEQ_OS 0x1c +/* Greater-than-or-equal (ordered, non-signaling) */ +#define _CMP_GE_OQ 0x1d +/* Greater-than (ordered, non-signaling) */ +#define _CMP_GT_OQ 0x1e +/* True (unordered, signaling) */ +#define _CMP_TRUE_US 0x1f + +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_add_pd (__m256d __A, __m256d 
__B) +{ + return (__m256d) __builtin_ia32_addpd256 ((__v4df)__A, (__v4df)__B); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_add_ps (__m256 __A, __m256 __B) +{ + return (__m256) __builtin_ia32_addps256 ((__v8sf)__A, (__v8sf)__B); +} + +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_addsub_pd (__m256d __A, __m256d __B) +{ + return (__m256d) __builtin_ia32_addsubpd256 ((__v4df)__A, (__v4df)__B); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_addsub_ps (__m256 __A, __m256 __B) +{ + return (__m256) __builtin_ia32_addsubps256 ((__v8sf)__A, (__v8sf)__B); +} + + +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_and_pd (__m256d __A, __m256d __B) +{ + return (__m256d) __builtin_ia32_andpd256 ((__v4df)__A, (__v4df)__B); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_and_ps (__m256 __A, __m256 __B) +{ + return (__m256) __builtin_ia32_andps256 ((__v8sf)__A, (__v8sf)__B); +} + +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_andnot_pd (__m256d __A, __m256d __B) +{ + return (__m256d) __builtin_ia32_andnpd256 ((__v4df)__A, (__v4df)__B); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_andnot_ps (__m256 __A, __m256 __B) +{ + return (__m256) __builtin_ia32_andnps256 ((__v8sf)__A, (__v8sf)__B); +} + +/* Double/single precision floating point blend instructions - select + data from 2 sources using constant/variable mask. */ + +#ifdef __OPTIMIZE__ +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_blend_pd (__m256d __X, __m256d __Y, const int __M) +{ + return (__m256d) __builtin_ia32_blendpd256 ((__v4df)__X, + (__v4df)__Y, + __M); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_blend_ps (__m256 __X, __m256 __Y, const int __M) +{ + return (__m256) __builtin_ia32_blendps256 ((__v8sf)__X, + (__v8sf)__Y, + __M); +} +#else +#define _mm256_blend_pd(X, Y, M) \ + ((__m256d) __builtin_ia32_blendpd256 ((__v4df)(__m256d)(X), \ + (__v4df)(__m256d)(Y), (int)(M))) + +#define _mm256_blend_ps(X, Y, M) \ + ((__m256) __builtin_ia32_blendps256 ((__v8sf)(__m256)(X), \ + (__v8sf)(__m256)(Y), (int)(M))) +#endif + +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_blendv_pd (__m256d __X, __m256d __Y, __m256d __M) +{ + return (__m256d) __builtin_ia32_blendvpd256 ((__v4df)__X, + (__v4df)__Y, + (__v4df)__M); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_blendv_ps (__m256 __X, __m256 __Y, __m256 __M) +{ + return (__m256) __builtin_ia32_blendvps256 ((__v8sf)__X, + (__v8sf)__Y, + (__v8sf)__M); +} + +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_div_pd (__m256d __A, __m256d __B) +{ + return (__m256d) __builtin_ia32_divpd256 ((__v4df)__A, (__v4df)__B); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_div_ps (__m256 __A, __m256 __B) +{ + return (__m256) __builtin_ia32_divps256 ((__v8sf)__A, (__v8sf)__B); +} + +/* Dot product instructions with mask-defined summing and zeroing parts + of result. 
*/ + +#ifdef __OPTIMIZE__ +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_dp_ps (__m256 __X, __m256 __Y, const int __M) +{ + return (__m256) __builtin_ia32_dpps256 ((__v8sf)__X, + (__v8sf)__Y, + __M); +} +#else +#define _mm256_dp_ps(X, Y, M) \ + ((__m256) __builtin_ia32_dpps256 ((__v8sf)(__m256)(X), \ + (__v8sf)(__m256)(Y), (int)(M))) +#endif + +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_hadd_pd (__m256d __X, __m256d __Y) +{ + return (__m256d) __builtin_ia32_haddpd256 ((__v4df)__X, (__v4df)__Y); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_hadd_ps (__m256 __X, __m256 __Y) +{ + return (__m256) __builtin_ia32_haddps256 ((__v8sf)__X, (__v8sf)__Y); +} + +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_hsub_pd (__m256d __X, __m256d __Y) +{ + return (__m256d) __builtin_ia32_hsubpd256 ((__v4df)__X, (__v4df)__Y); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_hsub_ps (__m256 __X, __m256 __Y) +{ + return (__m256) __builtin_ia32_hsubps256 ((__v8sf)__X, (__v8sf)__Y); +} + +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_max_pd (__m256d __A, __m256d __B) +{ + return (__m256d) __builtin_ia32_maxpd256 ((__v4df)__A, (__v4df)__B); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_max_ps (__m256 __A, __m256 __B) +{ + return (__m256) __builtin_ia32_maxps256 ((__v8sf)__A, (__v8sf)__B); +} + +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_min_pd (__m256d __A, __m256d __B) +{ + return (__m256d) __builtin_ia32_minpd256 ((__v4df)__A, (__v4df)__B); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_min_ps (__m256 __A, __m256 __B) +{ + return (__m256) __builtin_ia32_minps256 ((__v8sf)__A, (__v8sf)__B); +} + +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mul_pd (__m256d __A, __m256d __B) +{ + return (__m256d) __builtin_ia32_mulpd256 ((__v4df)__A, (__v4df)__B); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_mul_ps (__m256 __A, __m256 __B) +{ + return (__m256) __builtin_ia32_mulps256 ((__v8sf)__A, (__v8sf)__B); +} + +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_or_pd (__m256d __A, __m256d __B) +{ + return (__m256d) __builtin_ia32_orpd256 ((__v4df)__A, (__v4df)__B); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_or_ps (__m256 __A, __m256 __B) +{ + return (__m256) __builtin_ia32_orps256 ((__v8sf)__A, (__v8sf)__B); +} + +#ifdef __OPTIMIZE__ +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_shuffle_pd (__m256d __A, __m256d __B, const int __mask) +{ + return (__m256d) __builtin_ia32_shufpd256 ((__v4df)__A, (__v4df)__B, + __mask); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_shuffle_ps (__m256 __A, __m256 __B, const int __mask) +{ + return (__m256) __builtin_ia32_shufps256 ((__v8sf)__A, (__v8sf)__B, + __mask); +} +#else +#define _mm256_shuffle_pd(A, B, N) \ + ((__m256d)__builtin_ia32_shufpd256 ((__v4df)(__m256d)(A), \ + 
(__v4df)(__m256d)(B), (int)(N))) + +#define _mm256_shuffle_ps(A, B, N) \ + ((__m256) __builtin_ia32_shufps256 ((__v8sf)(__m256)(A), \ + (__v8sf)(__m256)(B), (int)(N))) +#endif + +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_sub_pd (__m256d __A, __m256d __B) +{ + return (__m256d) __builtin_ia32_subpd256 ((__v4df)__A, (__v4df)__B); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_sub_ps (__m256 __A, __m256 __B) +{ + return (__m256) __builtin_ia32_subps256 ((__v8sf)__A, (__v8sf)__B); +} + +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_xor_pd (__m256d __A, __m256d __B) +{ + return (__m256d) __builtin_ia32_xorpd256 ((__v4df)__A, (__v4df)__B); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_xor_ps (__m256 __A, __m256 __B) +{ + return (__m256) __builtin_ia32_xorps256 ((__v8sf)__A, (__v8sf)__B); +} + +#ifdef __OPTIMIZE__ +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmp_pd (__m128d __X, __m128d __Y, const int __P) +{ + return (__m128d) __builtin_ia32_cmppd ((__v2df)__X, (__v2df)__Y, __P); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmp_ps (__m128 __X, __m128 __Y, const int __P) +{ + return (__m128) __builtin_ia32_cmpps ((__v4sf)__X, (__v4sf)__Y, __P); +} + +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmp_pd (__m256d __X, __m256d __Y, const int __P) +{ + return (__m256d) __builtin_ia32_cmppd256 ((__v4df)__X, (__v4df)__Y, + __P); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cmp_ps (__m256 __X, __m256 __Y, const int __P) +{ + return (__m256) __builtin_ia32_cmpps256 ((__v8sf)__X, (__v8sf)__Y, + __P); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmp_sd (__m128d __X, __m128d __Y, const int __P) +{ + return (__m128d) __builtin_ia32_cmpsd ((__v2df)__X, (__v2df)__Y, __P); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_cmp_ss (__m128 __X, __m128 __Y, const int __P) +{ + return (__m128) __builtin_ia32_cmpss ((__v4sf)__X, (__v4sf)__Y, __P); +} +#else +#define _mm_cmp_pd(X, Y, P) \ + ((__m128d) __builtin_ia32_cmppd ((__v2df)(__m128d)(X), \ + (__v2df)(__m128d)(Y), (int)(P))) + +#define _mm_cmp_ps(X, Y, P) \ + ((__m128) __builtin_ia32_cmpps ((__v4sf)(__m128)(X), \ + (__v4sf)(__m128)(Y), (int)(P))) + +#define _mm256_cmp_pd(X, Y, P) \ + ((__m256d) __builtin_ia32_cmppd256 ((__v4df)(__m256d)(X), \ + (__v4df)(__m256d)(Y), (int)(P))) + +#define _mm256_cmp_ps(X, Y, P) \ + ((__m256) __builtin_ia32_cmpps256 ((__v8sf)(__m256)(X), \ + (__v8sf)(__m256)(Y), (int)(P))) + +#define _mm_cmp_sd(X, Y, P) \ + ((__m128d) __builtin_ia32_cmpsd ((__v2df)(__m128d)(X), \ + (__v2df)(__m128d)(Y), (int)(P))) + +#define _mm_cmp_ss(X, Y, P) \ + ((__m128) __builtin_ia32_cmpss ((__v4sf)(__m128)(X), \ + (__v4sf)(__m128)(Y), (int)(P))) +#endif + +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtepi32_pd (__m128i __A) +{ + return (__m256d)__builtin_ia32_cvtdq2pd256 ((__v4si) __A); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtepi32_ps (__m256i __A) +{ + return (__m256)__builtin_ia32_cvtdq2ps256 
((__v8si) __A); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtpd_ps (__m256d __A) +{ + return (__m128)__builtin_ia32_cvtpd2ps256 ((__v4df) __A); +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtps_epi32 (__m256 __A) +{ + return (__m256i)__builtin_ia32_cvtps2dq256 ((__v8sf) __A); +} + +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtps_pd (__m128 __A) +{ + return (__m256d)__builtin_ia32_cvtps2pd256 ((__v4sf) __A); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvttpd_epi32 (__m256d __A) +{ + return (__m128i)__builtin_ia32_cvttpd2dq256 ((__v4df) __A); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvtpd_epi32 (__m256d __A) +{ + return (__m128i)__builtin_ia32_cvtpd2dq256 ((__v4df) __A); +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_cvttps_epi32 (__m256 __A) +{ + return (__m256i)__builtin_ia32_cvttps2dq256 ((__v8sf) __A); +} + +#ifdef __OPTIMIZE__ +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_extractf128_pd (__m256d __X, const int __N) +{ + return (__m128d) __builtin_ia32_vextractf128_pd256 ((__v4df)__X, __N); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_extractf128_ps (__m256 __X, const int __N) +{ + return (__m128) __builtin_ia32_vextractf128_ps256 ((__v8sf)__X, __N); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_extractf128_si256 (__m256i __X, const int __N) +{ + return (__m128i) __builtin_ia32_vextractf128_si256 ((__v8si)__X, __N); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_extract_epi32 (__m256i __X, int const __N) +{ + __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 2); + return _mm_extract_epi32 (__Y, __N % 4); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_extract_epi16 (__m256i __X, int const __N) +{ + __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 3); + return _mm_extract_epi16 (__Y, __N % 8); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_extract_epi8 (__m256i __X, int const __N) +{ + __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 4); + return _mm_extract_epi8 (__Y, __N % 16); +} + +#ifdef __x86_64__ +extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_extract_epi64 (__m256i __X, const int __N) +{ + __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 1); + return _mm_extract_epi64 (__Y, __N % 2); +} +#endif +#else +#define _mm256_extractf128_pd(X, N) \ + ((__m128d) __builtin_ia32_vextractf128_pd256 ((__v4df)(__m256d)(X), \ + (int)(N))) + +#define _mm256_extractf128_ps(X, N) \ + ((__m128) __builtin_ia32_vextractf128_ps256 ((__v8sf)(__m256)(X), \ + (int)(N))) + +#define _mm256_extractf128_si256(X, N) \ + ((__m128i) __builtin_ia32_vextractf128_si256 ((__v8si)(__m256i)(X), \ + (int)(N))) + +#define _mm256_extract_epi32(X, N) \ + (__extension__ \ + ({ \ + __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 2); \ + _mm_extract_epi32 (__Y, (N) % 4); \ + })) + +#define _mm256_extract_epi16(X, N) \ + (__extension__ \ + ({ \ + __m128i __Y = 
_mm256_extractf128_si256 ((X), (N) >> 3); \ + _mm_extract_epi16 (__Y, (N) % 8); \ + })) + +#define _mm256_extract_epi8(X, N) \ + (__extension__ \ + ({ \ + __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 4); \ + _mm_extract_epi8 (__Y, (N) % 16); \ + })) + +#ifdef __x86_64__ +#define _mm256_extract_epi64(X, N) \ + (__extension__ \ + ({ \ + __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 1); \ + _mm_extract_epi64 (__Y, (N) % 2); \ + })) +#endif +#endif + +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_zeroall (void) +{ + __builtin_ia32_vzeroall (); +} + +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_zeroupper (void) +{ + __builtin_ia32_vzeroupper (); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_permutevar_pd (__m128d __A, __m128i __C) +{ + return (__m128d) __builtin_ia32_vpermilvarpd ((__v2df)__A, + (__v2di)__C); +} + +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_permutevar_pd (__m256d __A, __m256i __C) +{ + return (__m256d) __builtin_ia32_vpermilvarpd256 ((__v4df)__A, + (__v4di)__C); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_permutevar_ps (__m128 __A, __m128i __C) +{ + return (__m128) __builtin_ia32_vpermilvarps ((__v4sf)__A, + (__v4si)__C); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_permutevar_ps (__m256 __A, __m256i __C) +{ + return (__m256) __builtin_ia32_vpermilvarps256 ((__v8sf)__A, + (__v8si)__C); +} + +#ifdef __OPTIMIZE__ +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_permute_pd (__m128d __X, const int __C) +{ + return (__m128d) __builtin_ia32_vpermilpd ((__v2df)__X, __C); +} + +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_permute_pd (__m256d __X, const int __C) +{ + return (__m256d) __builtin_ia32_vpermilpd256 ((__v4df)__X, __C); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_permute_ps (__m128 __X, const int __C) +{ + return (__m128) __builtin_ia32_vpermilps ((__v4sf)__X, __C); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_permute_ps (__m256 __X, const int __C) +{ + return (__m256) __builtin_ia32_vpermilps256 ((__v8sf)__X, __C); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_permute2_pd (__m128d __X, __m128d __Y, __m128i __C, const int __I) +{ + return (__m128d) __builtin_ia32_vpermil2pd ((__v2df)__X, + (__v2df)__Y, + (__v2di)__C, + __I); +} + +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_permute2_pd (__m256d __X, __m256d __Y, __m256i __C, const int __I) +{ + return (__m256d) __builtin_ia32_vpermil2pd256 ((__v4df)__X, + (__v4df)__Y, + (__v4di)__C, + __I); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_permute2_ps (__m128 __X, __m128 __Y, __m128i __C, const int __I) +{ + return (__m128) __builtin_ia32_vpermil2ps ((__v4sf)__X, + (__v4sf)__Y, + (__v4si)__C, + __I); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_permute2_ps (__m256 __X, __m256 __Y, __m256i __C, const int __I) +{ + return (__m256) __builtin_ia32_vpermil2ps256 
((__v8sf)__X, + (__v8sf)__Y, + (__v8si)__C, + __I); +} +#else +#define _mm_permute_pd(X, C) \ + ((__m128d) __builtin_ia32_vpermilpd ((__v2df)(__m128d)(X), (int)(C))) + +#define _mm256_permute_pd(X, C) \ + ((__m256d) __builtin_ia32_vpermilpd256 ((__v4df)(__m256d)(X), (int)(C))) + +#define _mm_permute_ps(X, C) \ + ((__m128) __builtin_ia32_vpermilps ((__v4sf)(__m128)(X), (int)(C))) + +#define _mm256_permute_ps(X, C) \ + ((__m256) __builtin_ia32_vpermilps256 ((__v8sf)(__m256)(X), (int)(C))) + +#define _mm_permute2_pd(X, Y, C, I) \ + ((__m128d) __builtin_ia32_vpermil2pd ((__v2df)(__m128d)(X), \ + (__v2df)(__m128d)(Y), \ + (__v2di)(__m128d)(C), \ + (int)(I))) + +#define _mm256_permute2_pd(X, Y, C, I) \ + ((__m256d) __builtin_ia32_vpermil2pd256 ((__v4df)(__m256d)(X), \ + (__v4df)(__m256d)(Y), \ + (__v4di)(__m256d)(C), \ + (int)(I))) + +#define _mm_permute2_ps(X, Y, C, I) \ + ((__m128) __builtin_ia32_vpermil2ps ((__v4sf)(__m128)(X), \ + (__v4sf)(__m128)(Y), \ + (__v4si)(__m128)(C), \ + (int)(I))) + +#define _mm256_permute2_ps(X, Y, C, I) \ + ((__m256) __builtin_ia32_vpermil2ps256 ((__v8sf)(__m256)(X), \ + (__v8sf)(__m256)(Y), \ + (__v8si)(__m256)(C), \ + (int)(I))) +#endif + +#ifdef __OPTIMIZE__ +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_permute2f128_pd (__m256d __X, __m256d __Y, const int __C) +{ + return (__m256d) __builtin_ia32_vperm2f128_pd256 ((__v4df)__X, + (__v4df)__Y, + __C); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_permute2f128_ps (__m256 __X, __m256 __Y, const int __C) +{ + return (__m256) __builtin_ia32_vperm2f128_ps256 ((__v8sf)__X, + (__v8sf)__Y, + __C); +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_permute2f128_si256 (__m256i __X, __m256i __Y, const int __C) +{ + return (__m256i) __builtin_ia32_vperm2f128_si256 ((__v8si)__X, + (__v8si)__Y, + __C); +} +#else +#define _mm256_permute2f128_pd(X, Y, C) \ + ((__m256d) __builtin_ia32_vperm2f128_pd256 ((__v4df)(__m256d)(X), \ + (__v4df)(__m256d)(Y), \ + (int)(C))) + +#define _mm256_permute2f128_ps(X, Y, C) \ + ((__m256) __builtin_ia32_vperm2f128_ps256 ((__v8sf)(__m256)(X), \ + (__v8sf)(__m256)(Y), \ + (int)(C))) + +#define _mm256_permute2f128_si256(X, Y, C) \ + ((__m256i) __builtin_ia32_vperm2f128_si256 ((__v8si)(__m256i)(X), \ + (__v8si)(__m256i)(Y), \ + (int)(C))) +#endif + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_broadcast_ss (float const *__X) +{ + return (__m128) __builtin_ia32_vbroadcastss (__X); +} + +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_broadcast_sd (double const *__X) +{ + return (__m256d) __builtin_ia32_vbroadcastsd256 (__X); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_broadcast_ss (float const *__X) +{ + return (__m256) __builtin_ia32_vbroadcastss256 (__X); +} + +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_broadcast_pd (__m128d const *__X) +{ + return (__m256d) __builtin_ia32_vbroadcastf128_pd256 (__X); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_broadcast_ps (__m128 const *__X) +{ + return (__m256) __builtin_ia32_vbroadcastf128_ps256 (__X); +} + +#ifdef __OPTIMIZE__ +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 
+_mm256_insertf128_pd (__m256d __X, __m128d __Y, const int __O) +{ + return (__m256d) __builtin_ia32_vinsertf128_pd256 ((__v4df)__X, + (__v2df)__Y, + __O); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_insertf128_ps (__m256 __X, __m128 __Y, const int __O) +{ + return (__m256) __builtin_ia32_vinsertf128_ps256 ((__v8sf)__X, + (__v4sf)__Y, + __O); +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_insertf128_si256 (__m256i __X, __m128i __Y, const int __O) +{ + return (__m256i) __builtin_ia32_vinsertf128_si256 ((__v8si)__X, + (__v4si)__Y, + __O); +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_insert_epi32 (__m256i __X, int __D, int const __N) +{ + __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 2); + __Y = _mm_insert_epi16 (__Y, __D, __N % 4); + return _mm256_insertf128_si256 (__X, __Y, __N >> 2); +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_insert_epi16 (__m256i __X, int __D, int const __N) +{ + __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 3); + __Y = _mm_insert_epi16 (__Y, __D, __N % 8); + return _mm256_insertf128_si256 (__X, __Y, __N >> 3); +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_insert_epi8 (__m256i __X, int __D, int const __N) +{ + __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 4); + __Y = _mm_insert_epi8 (__Y, __D, __N % 16); + return _mm256_insertf128_si256 (__X, __Y, __N >> 4); +} + +#ifdef __x86_64__ +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_insert_epi64 (__m256i __X, int __D, int const __N) +{ + __m128i __Y = _mm256_extractf128_si256 (__X, __N >> 1); + __Y = _mm_insert_epi16 (__Y, __D, __N % 2); + return _mm256_insertf128_si256 (__X, __Y, __N >> 1); +} +#endif +#else +#define _mm256_insertf128_pd(X, Y, O) \ + ((__m256d) __builtin_ia32_vinsertf128_pd256 ((__v4df)(__m256d)(X), \ + (__v2df)(__m128d)(Y), \ + (int)(O))) + +#define _mm256_insertf128_ps(X, Y, O) \ + ((__m256) __builtin_ia32_vinsertf128_ps256 ((__v8sf)(__m256)(X), \ + (__v4sf)(__m128)(Y), \ + (int)(O))) + +#define _mm256_insertf128_si256(X, Y, O) \ + ((__m256i) __builtin_ia32_vinsertf128_si256 ((__v8si)(__m256i)(X), \ + (__v4si)(__m128i)(Y), \ + (int)(O))) + +#define _mm256_insert_epi32(X, D, N) \ + (__extension__ \ + ({ \ + __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 2); \ + __Y = _mm_insert_epi32 (__Y, (D), (N) % 4); \ + _mm256_insertf128_si256 ((X), __Y, (N) >> 2); \ + })) + +#define _mm256_insert_epi16(X, D, N) \ + (__extension__ \ + ({ \ + __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 3); \ + __Y = _mm_insert_epi16 (__Y, (D), (N) % 8); \ + _mm256_insertf128_si256 ((X), __Y, (N) >> 3); \ + })) + +#define _mm256_insert_epi8(X, D, N) \ + (__extension__ \ + ({ \ + __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 4); \ + __Y = _mm_insert_epi8 (__Y, (D), (N) % 16); \ + _mm256_insertf128_si256 ((X), __Y, (N) >> 4); \ + })) + +#ifdef __x86_64__ +#define _mm256_insert_epi64(X, D, N) \ + (__extension__ \ + ({ \ + __m128i __Y = _mm256_extractf128_si256 ((X), (N) >> 1); \ + __Y = _mm_insert_epi64 (__Y, (D), (N) % 2); \ + _mm256_insertf128_si256 ((X), __Y, (N) >> 1); \ + })) +#endif +#endif + +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_load_pd (double const *__P) +{ + return *(__m256d *)__P; +} + 
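A minimal usage sketch (illustrative only, not part of this patch), assuming the translation unit is compiled with -mavx so that __AVX__ is defined and <gmmintrin.h> can be included directly; the function name, the 32-byte alignment of the buffers, and n being a multiple of 4 are all assumptions made for the example:

#include <gmmintrin.h>

/* Add two arrays of doubles, four elements (one 256-bit register) per
   iteration, using the aligned load/store and packed add intrinsics
   declared above.  */
void
add_arrays (double *dst, const double *a, const double *b, int n)
{
  int i;
  for (i = 0; i < n; i += 4)
    {
      __m256d va = _mm256_load_pd (a + i);
      __m256d vb = _mm256_load_pd (b + i);
      _mm256_store_pd (dst + i, _mm256_add_pd (va, vb));
    }
}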
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_store_pd (double *__P, __m256d __A) +{ + *(__m256d *)__P = __A; +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_load_ps (float const *__P) +{ + return *(__m256 *)__P; +} + +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_store_ps (float *__P, __m256 __A) +{ + *(__m256 *)__P = __A; +} + +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_loadu_pd (double const *__P) +{ + return (__m256d) __builtin_ia32_loadupd256 (__P); +} + +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_storeu_pd (double *__P, __m256d __A) +{ + __builtin_ia32_storeupd256 (__P, (__v4df)__A); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_loadu_ps (float const *__P) +{ + return (__m256) __builtin_ia32_loadups256 (__P); +} + +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_storeu_ps (float *__P, __m256 __A) +{ + __builtin_ia32_storeups256 (__P, (__v8sf)__A); +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_load_si256 (__m256i const *__P) +{ + return *__P; +} + +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_store_si256 (__m256i *__P, __m256i __A) +{ + *__P = __A; +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_loadu_si256 (__m256i const *__P) +{ + return (__m256i) __builtin_ia32_loaddqu256 ((char const *)__P); +} + +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_storeu_si256 (__m256i *__P, __m256i __A) +{ + __builtin_ia32_storedqu256 ((char *)__P, (__v32qi)__A); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskload_pd (double const *__P, __m128d __M) +{ + return (__m128d) __builtin_ia32_maskloadpd ((const __v2df *)__P, + (__v2df)__M); +} + +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskstore_pd (double *__P, __m128d __M, __m128d __A) +{ + __builtin_ia32_maskstorepd ((__v2df *)__P, (__v2df)__M, (__v2df)__A); +} + +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskload_pd (double const *__P, __m256d __M) +{ + return (__m256d) __builtin_ia32_maskloadpd256 ((const __v4df *)__P, + (__v4df)__M); +} + +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskstore_pd (double *__P, __m256d __M, __m256d __A) +{ + __builtin_ia32_maskstorepd256 ((__v4df *)__P, (__v4df)__M, (__v4df)__A); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskload_ps (float const *__P, __m128 __M) +{ + return (__m128) __builtin_ia32_maskloadps ((const __v4sf *)__P, + (__v4sf)__M); +} + +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maskstore_ps (float *__P, __m128 __M, __m128 __A) +{ + __builtin_ia32_maskstoreps ((__v4sf *)__P, (__v4sf)__M, (__v4sf)__A); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskload_ps (float const *__P, __m256 __M) +{ + return (__m256) __builtin_ia32_maskloadps256 ((const 
__v8sf *)__P, + (__v8sf)__M); +} + +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_maskstore_ps (float *__P, __m256 __M, __m256 __A) +{ + __builtin_ia32_maskstoreps256 ((__v8sf *)__P, (__v8sf)__M, (__v8sf)__A); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_movehdup_ps (__m256 __X) +{ + return (__m256) __builtin_ia32_movshdup256 ((__v8sf)__X); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_moveldup_ps (__m256 __X) +{ + return (__m256) __builtin_ia32_movsldup256 ((__v8sf)__X); +} + +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_movedup_pd (__m256d __X) +{ + return (__m256d) __builtin_ia32_movddup256 ((__v4df)__X); +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_lddqu_si256 (__m256i const *__P) +{ + return (__m256i) __builtin_ia32_lddqu256 ((char const *)__P); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_rcp_ps (__m256 __A) +{ + return (__m256) __builtin_ia32_rcpps256 ((__v8sf)__A); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_rsqrt_ps (__m256 __A) +{ + return (__m256) __builtin_ia32_rsqrtps256 ((__v8sf)__A); +} + +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_sqrt_pd (__m256d __A) +{ + return (__m256d) __builtin_ia32_sqrtpd256 ((__v4df)__A); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_sqrt_ps (__m256 __A) +{ + return (__m256) __builtin_ia32_sqrtps256 ((__v8sf)__A); +} + +#ifdef __OPTIMIZE__ +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_round_pd (__m256d __V, const int __M) +{ + return (__m256d) __builtin_ia32_roundpd256 ((__v4df)__V, __M); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_round_ps (__m256 __V, const int __M) +{ + return (__m256) __builtin_ia32_roundps256 ((__v8sf)__V, __M); +} +#else +#define _mm256_round_pd(V, M) \ + ((__m256d) __builtin_ia32_roundpd256 ((__v4df)(__m256d)(V), (int)(M))) + +#define _mm256_round_ps(V, M) \ + ((__m256) __builtin_ia32_roundps256 ((__v8sf)(__m256)(V), (int)(M))) +#endif + +#define _mm256_ceil_pd(V) _mm256_round_pd ((V), _MM_FROUND_CEIL) +#define _mm256_floor_pd(V) _mm256_round_pd ((V), _MM_FROUND_FLOOR) +#define _mm256_ceil_ps(V) _mm256_round_ps ((V), _MM_FROUND_CEIL) +#define _mm256_floor_ps(V) _mm256_round_ps ((V), _MM_FROUND_FLOOR) + +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_unpackhi_pd (__m256d __A, __m256d __B) +{ + return (__m256d) __builtin_ia32_unpckhpd256 ((__v4df)__A, (__v4df)__B); +} + +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_unpacklo_pd (__m256d __A, __m256d __B) +{ + return (__m256d) __builtin_ia32_unpcklpd256 ((__v4df)__A, (__v4df)__B); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_unpackhi_ps (__m256 __A, __m256 __B) +{ + return (__m256) __builtin_ia32_unpckhps256 ((__v8sf)__A, (__v8sf)__B); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_unpacklo_ps (__m256 __A, __m256 __B) +{ + return (__m256) 
__builtin_ia32_unpcklps256 ((__v8sf)__A, (__v8sf)__B); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_testz_pd (__m128d __M, __m128d __V) +{ + return __builtin_ia32_vtestzpd ((__v2df)__M, (__v2df)__V); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_testc_pd (__m128d __M, __m128d __V) +{ + return __builtin_ia32_vtestcpd ((__v2df)__M, (__v2df)__V); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_testnzc_pd (__m128d __M, __m128d __V) +{ + return __builtin_ia32_vtestnzcpd ((__v2df)__M, (__v2df)__V); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_testz_ps (__m128 __M, __m128 __V) +{ + return __builtin_ia32_vtestzps ((__v4sf)__M, (__v4sf)__V); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_testc_ps (__m128 __M, __m128 __V) +{ + return __builtin_ia32_vtestcps ((__v4sf)__M, (__v4sf)__V); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_testnzc_ps (__m128 __M, __m128 __V) +{ + return __builtin_ia32_vtestnzcps ((__v4sf)__M, (__v4sf)__V); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_testz_pd (__m256d __M, __m256d __V) +{ + return __builtin_ia32_vtestzpd256 ((__v4df)__M, (__v4df)__V); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_testc_pd (__m256d __M, __m256d __V) +{ + return __builtin_ia32_vtestcpd256 ((__v4df)__M, (__v4df)__V); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_testnzc_pd (__m256d __M, __m256d __V) +{ + return __builtin_ia32_vtestnzcpd256 ((__v4df)__M, (__v4df)__V); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_testz_ps (__m256 __M, __m256 __V) +{ + return __builtin_ia32_vtestzps256 ((__v8sf)__M, (__v8sf)__V); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_testc_ps (__m256 __M, __m256 __V) +{ + return __builtin_ia32_vtestcps256 ((__v8sf)__M, (__v8sf)__V); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_testnzc_ps (__m256 __M, __m256 __V) +{ + return __builtin_ia32_vtestnzcps256 ((__v8sf)__M, (__v8sf)__V); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_testz_si256 (__m256i __M, __m256i __V) +{ + return __builtin_ia32_ptestz256 ((__v4di)__M, (__v4di)__V); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_testc_si256 (__m256i __M, __m256i __V) +{ + return __builtin_ia32_ptestc256 ((__v4di)__M, (__v4di)__V); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_testnzc_si256 (__m256i __M, __m256i __V) +{ + return __builtin_ia32_ptestnzc256 ((__v4di)__M, (__v4di)__V); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_movemask_pd (__m256d __A) +{ + return __builtin_ia32_movmskpd256 ((__v4df)__A); +} + +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_movemask_ps (__m256 __A) +{ + return __builtin_ia32_movmskps256 ((__v8sf)__A); +} + +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 
+_mm256_setzero_pd (void) +{ + return __extension__ (__m256d){ 0.0, 0.0, 0.0, 0.0 }; +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_setzero_ps (void) +{ + return __extension__ (__m256){ 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0 }; +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_setzero_si256 (void) +{ + return __extension__ (__m256i)(__v4di){ 0, 0, 0, 0 }; +} + +/* Create the vector [A B C D]. */ +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_set_pd (double __A, double __B, double __C, double __D) +{ + return __extension__ (__m256d){ __D, __C, __B, __A }; +} + +/* Create the vector [A B C D E F G H]. */ +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_set_ps (float __A, float __B, float __C, float __D, + float __E, float __F, float __G, float __H) +{ + return __extension__ (__m256){ __H, __G, __F, __E, + __D, __C, __B, __A }; +} + +/* Create the vector [A B C D E F G H]. */ +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_set_epi32 (int __A, int __B, int __C, int __D, + int __E, int __F, int __G, int __H) +{ + return __extension__ (__m256i)(__v8si){ __H, __G, __F, __E, + __D, __C, __B, __A }; +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_set_epi16 (short __q15, short __q14, short __q13, short __q12, + short __q11, short __q10, short __q09, short __q08, + short __q07, short __q06, short __q05, short __q04, + short __q03, short __q02, short __q01, short __q00) +{ + return __extension__ (__m256i)(__v16hi){ + __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07, + __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15 + }; +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_set_epi8 (char __q31, char __q30, char __q29, char __q28, + char __q27, char __q26, char __q25, char __q24, + char __q23, char __q22, char __q21, char __q20, + char __q19, char __q18, char __q17, char __q16, + char __q15, char __q14, char __q13, char __q12, + char __q11, char __q10, char __q09, char __q08, + char __q07, char __q06, char __q05, char __q04, + char __q03, char __q02, char __q01, char __q00) +{ + return __extension__ (__m256i)(__v32qi){ + __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07, + __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15, + __q16, __q17, __q18, __q19, __q20, __q21, __q22, __q23, + __q24, __q25, __q26, __q27, __q28, __q29, __q30, __q31 + }; +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_set_epi64x (long long __A, long long __B, long long __C, + long long __D) +{ + return __extension__ (__m256i)(__v4di){ __D, __C, __B, __A }; +} + +/* Create a vector with all elements equal to A. */ +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_set1_pd (double __A) +{ + return __extension__ (__m256d){ __A, __A, __A, __A }; +} + +/* Create a vector with all elements equal to A. */ +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_set1_ps (float __A) +{ + return __extension__ (__m256){ __A, __A, __A, __A, + __A, __A, __A, __A }; +} + +/* Create a vector with all elements equal to A. 
*/ +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_set1_epi32 (int __A) +{ + return __extension__ (__m256i)(__v8si){ __A, __A, __A, __A, + __A, __A, __A, __A }; +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_set1_epi16 (short __A) +{ + return _mm256_set_epi16 (__A, __A, __A, __A, __A, __A, __A, __A, + __A, __A, __A, __A, __A, __A, __A, __A); +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_set1_epi8 (char __A) +{ + return _mm256_set_epi8 (__A, __A, __A, __A, __A, __A, __A, __A, + __A, __A, __A, __A, __A, __A, __A, __A, + __A, __A, __A, __A, __A, __A, __A, __A, + __A, __A, __A, __A, __A, __A, __A, __A); +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_set1_epi64x (long long __A) +{ + return __extension__ (__m256i)(__v4di){ __A, __A, __A, __A }; +} + +/* Create vectors of elements in the reversed order from the + _mm256_set_XXX functions. */ + +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_setr_pd (double __A, double __B, double __C, double __D) +{ + return _mm256_set_pd (__D, __C, __B, __A); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_setr_ps (float __A, float __B, float __C, float __D, + float __E, float __F, float __G, float __H) +{ + return _mm256_set_ps (__H, __G, __F, __E, __D, __C, __B, __A); +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_setr_epi32 (int __A, int __B, int __C, int __D, + int __E, int __F, int __G, int __H) +{ + return _mm256_set_epi32 (__H, __G, __F, __E, __D, __C, __B, __A); +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_setr_epi16 (short __q15, short __q14, short __q13, short __q12, + short __q11, short __q10, short __q09, short __q08, + short __q07, short __q06, short __q05, short __q04, + short __q03, short __q02, short __q01, short __q00) +{ + return _mm256_set_epi16 (__q00, __q01, __q02, __q03, + __q04, __q05, __q06, __q07, + __q08, __q09, __q10, __q11, + __q12, __q13, __q14, __q15); +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_setr_epi8 (char __q31, char __q30, char __q29, char __q28, + char __q27, char __q26, char __q25, char __q24, + char __q23, char __q22, char __q21, char __q20, + char __q19, char __q18, char __q17, char __q16, + char __q15, char __q14, char __q13, char __q12, + char __q11, char __q10, char __q09, char __q08, + char __q07, char __q06, char __q05, char __q04, + char __q03, char __q02, char __q01, char __q00) +{ + return _mm256_set_epi8 (__q00, __q01, __q02, __q03, + __q04, __q05, __q06, __q07, + __q08, __q09, __q10, __q11, + __q12, __q13, __q14, __q15, + __q16, __q17, __q18, __q19, + __q20, __q21, __q22, __q23, + __q24, __q25, __q26, __q27, + __q28, __q29, __q30, __q31); +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_setr_epi64x (long long __A, long long __B, long long __C, + long long __D) +{ + return _mm256_set_epi64x (__D, __C, __B, __A); +} + +/* Casts between various SP, DP, INT vector types. Note that these do no + conversion of values, they just change the type. 
*/ +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_castpd_ps (__m256d __A) +{ + return (__m256) __A; +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_castpd_si256 (__m256d __A) +{ + return (__m256i) __A; +} + +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_castps_pd (__m256 __A) +{ + return (__m256d) __A; +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_castps_si256(__m256 __A) +{ + return (__m256i) __A; +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_castsi256_ps (__m256i __A) +{ + return (__m256) __A; +} + +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_castsi256_pd (__m256i __A) +{ + return (__m256d) __A; +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_castpd256_pd128 (__m256d __A) +{ + return (__m128d) __builtin_ia32_pd_pd256 ((__v4df)__A); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_castps256_ps128 (__m256 __A) +{ + return (__m128) __builtin_ia32_ps_ps256 ((__v8sf)__A); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_castsi256_si128 (__m256i __A) +{ + return (__m128i) __builtin_ia32_si_si256 ((__v8si)__A); +} + +/* When cast is done from a 128 to 256-bit type, the low 128 bits of + the 256-bit result contain source parameter value and the upper 128 + bits of the result are undefined. Those intrinsics shouldn't + generate any extra moves. */ + +extern __inline __m256d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_castpd128_pd256 (__m128d __A) +{ + return (__m256d) __builtin_ia32_pd256_pd ((__v2df)__A); +} + +extern __inline __m256 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_castps128_ps256 (__m128 __A) +{ + return (__m256) __builtin_ia32_ps256_ps ((__v4sf)__A); +} + +extern __inline __m256i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm256_castsi128_si256 (__m128i __A) +{ + return (__m256i) __builtin_ia32_si256_si ((__v4si)__A); +} + +#endif /* __AVX__ */ + +#endif /* _GMMINTRIN_H_INCLUDED */ diff --git a/gcc/config/i386/i386-c.c b/gcc/config/i386/i386-c.c index f0a3a17..411c28d 100644 --- a/gcc/config/i386/i386-c.c +++ b/gcc/config/i386/i386-c.c @@ -217,6 +217,10 @@ ix86_target_macros_internal (int isa_flag, def_or_undef (parse_in, "__AES__"); if (isa_flag & OPTION_MASK_ISA_PCLMUL) def_or_undef (parse_in, "__PCLMUL__"); + if (isa_flag & OPTION_MASK_ISA_AVX) + def_or_undef (parse_in, "__AVX__"); + if (isa_flag & OPTION_MASK_ISA_FMA) + def_or_undef (parse_in, "__FMA__"); if (isa_flag & OPTION_MASK_ISA_SSE4A) def_or_undef (parse_in, "__SSE4A__"); if (isa_flag & OPTION_MASK_ISA_SSE5) diff --git a/gcc/config/i386/i386-modes.def b/gcc/config/i386/i386-modes.def index 9be7498..f5fb906 100644 --- a/gcc/config/i386/i386-modes.def +++ b/gcc/config/i386/i386-modes.def @@ -73,17 +73,20 @@ CC_MODE (CCFPU); VECTOR_MODES (INT, 4); /* V4QI V2HI */ VECTOR_MODES (INT, 8); /* V8QI V4HI V2SI */ VECTOR_MODES (INT, 16); /* V16QI V8HI V4SI V2DI */ +VECTOR_MODES (INT, 32); /* V32QI V16HI V8SI V4DI */ VECTOR_MODES (FLOAT, 8); /* V4HF V2SF */ VECTOR_MODES (FLOAT, 16); /* V8HF V4SF V2DF */ +VECTOR_MODES (FLOAT, 32); /* V16HF V8SF V4DF */ 
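A brief sketch (illustrative only, not part of this patch) of what the new 32-byte vector modes above mean at the source level: GCC's generic vector extension maps a vector_size (32) type onto modes such as V4DFmode, so with -mavx the addition below can be carried out in a single 256-bit register; the type and function names are hypothetical:

typedef double v4df __attribute__ ((vector_size (32)));   /* V4DFmode */

static v4df
add_v4df (v4df a, v4df b)
{
  return a + b;   /* a single 256-bit packed add when AVX is enabled */
}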
VECTOR_MODE (INT, DI, 1); /* V1DI */ VECTOR_MODE (INT, SI, 1); /* V1SI */ VECTOR_MODE (INT, QI, 2); /* V2QI */ -VECTOR_MODE (INT, DI, 4); /* V4DI */ -VECTOR_MODE (INT, SI, 8); /* V8SI */ -VECTOR_MODE (INT, HI, 16); /* V16HI */ -VECTOR_MODE (INT, QI, 32); /* V32QI */ -VECTOR_MODE (FLOAT, DF, 4); /* V4DF */ -VECTOR_MODE (FLOAT, SF, 8); /* V8SF */ +VECTOR_MODE (INT, DI, 8); /* V8DI */ +VECTOR_MODE (INT, HI, 32); /* V32HI */ +VECTOR_MODE (INT, QI, 64); /* V64QI */ +VECTOR_MODE (FLOAT, DF, 8); /* V8DF */ +VECTOR_MODE (FLOAT, SF, 16); /* V16SF */ + +INT_MODE (OI, 32); /* The symbol Pmode stands for one of the above machine modes (usually SImode). The tm.h file specifies which one. It is not a distinct mode. */ diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h index 3276bd8..5f055ab 100644 --- a/gcc/config/i386/i386-protos.h +++ b/gcc/config/i386/i386-protos.h @@ -128,6 +128,7 @@ extern int ix86_check_movabs (rtx, int); extern rtx assign_386_stack_local (enum machine_mode, enum ix86_stack_slot); extern int ix86_attr_length_immediate_default (rtx, int); extern int ix86_attr_length_address_default (rtx); +extern int ix86_attr_length_vex_default (rtx, int, int); extern enum machine_mode ix86_fp_compare_mode (enum rtx_code); diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c index 2f0392bf..4d45c84 100644 --- a/gcc/config/i386/i386.c +++ b/gcc/config/i386/i386.c @@ -1764,6 +1764,7 @@ enum x86_64_reg_class X86_64_NO_CLASS, X86_64_INTEGER_CLASS, X86_64_INTEGERSI_CLASS, + X86_64_AVX_CLASS, X86_64_SSE_CLASS, X86_64_SSESF_CLASS, X86_64_SSEDF_CLASS, @@ -1849,6 +1850,10 @@ static int ix86_isa_flags_explicit; (OPTION_MASK_ISA_SSE4_1 | OPTION_MASK_ISA_SSSE3_SET) #define OPTION_MASK_ISA_SSE4_2_SET \ (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_SSE4_1_SET) +#define OPTION_MASK_ISA_AVX_SET \ + (OPTION_MASK_ISA_AVX | OPTION_MASK_ISA_SSE4_2_SET) +#define OPTION_MASK_ISA_FMA_SET \ + (OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_AVX_SET) /* SSE4 includes both SSE4.1 and SSE4.2. -msse4 should be the same as -msse4.2. */ @@ -1892,7 +1897,11 @@ static int ix86_isa_flags_explicit; (OPTION_MASK_ISA_SSSE3 | OPTION_MASK_ISA_SSE4_1_UNSET) #define OPTION_MASK_ISA_SSE4_1_UNSET \ (OPTION_MASK_ISA_SSE4_1 | OPTION_MASK_ISA_SSE4_2_UNSET) -#define OPTION_MASK_ISA_SSE4_2_UNSET OPTION_MASK_ISA_SSE4_2 +#define OPTION_MASK_ISA_SSE4_2_UNSET \ + (OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_AVX_UNSET ) +#define OPTION_MASK_ISA_AVX_UNSET \ + (OPTION_MASK_ISA_AVX | OPTION_MASK_ISA_FMA_UNSET) +#define OPTION_MASK_ISA_FMA_UNSET OPTION_MASK_ISA_FMA /* SSE4 includes both SSE4.1 and SSE4.2. -mno-sse4 should the same as -mno-sse4.1. 
*/ @@ -2081,6 +2090,32 @@ ix86_handle_option (size_t code, const char *arg ATTRIBUTE_UNUSED, int value) } return true; + case OPT_mavx: + if (value) + { + ix86_isa_flags |= OPTION_MASK_ISA_AVX_SET; + ix86_isa_flags_explicit |= OPTION_MASK_ISA_AVX_SET; + } + else + { + ix86_isa_flags &= ~OPTION_MASK_ISA_AVX_UNSET; + ix86_isa_flags_explicit |= OPTION_MASK_ISA_AVX_UNSET; + } + return true; + + case OPT_mfma: + if (value) + { + ix86_isa_flags |= OPTION_MASK_ISA_FMA_SET; + ix86_isa_flags_explicit |= OPTION_MASK_ISA_FMA_SET; + } + else + { + ix86_isa_flags &= ~OPTION_MASK_ISA_FMA_UNSET; + ix86_isa_flags_explicit |= OPTION_MASK_ISA_FMA_UNSET; + } + return true; + case OPT_msse4: ix86_isa_flags |= OPTION_MASK_ISA_SSE4_SET; ix86_isa_flags_explicit |= OPTION_MASK_ISA_SSE4_SET; @@ -2447,7 +2482,9 @@ override_options (bool main_args_p) PTA_SSE4_2 = 1 << 15, PTA_SSE5 = 1 << 16, PTA_AES = 1 << 17, - PTA_PCLMUL = 1 << 18 + PTA_PCLMUL = 1 << 18, + PTA_AVX = 1 << 19, + PTA_FMA = 1 << 20 }; static struct pta @@ -2765,6 +2802,12 @@ override_options (bool main_args_p) if (processor_alias_table[i].flags & PTA_SSE4_2 && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4_2)) ix86_isa_flags |= OPTION_MASK_ISA_SSE4_2; + if (processor_alias_table[i].flags & PTA_AVX + && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_AVX)) + ix86_isa_flags |= OPTION_MASK_ISA_AVX; + if (processor_alias_table[i].flags & PTA_FMA + && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_FMA)) + ix86_isa_flags |= OPTION_MASK_ISA_FMA; if (processor_alias_table[i].flags & PTA_SSE4A && !(ix86_isa_flags_explicit & OPTION_MASK_ISA_SSE4A)) ix86_isa_flags |= OPTION_MASK_ISA_SSE4A; @@ -4587,6 +4630,7 @@ init_cumulative_args (CUMULATIVE_ARGS *cum, /* Argument info to initialize */ } if (TARGET_MMX) cum->mmx_nregs = MMX_REGPARM_MAX; + cum->warn_avx = true; cum->warn_sse = true; cum->warn_mmx = true; @@ -4611,6 +4655,7 @@ init_cumulative_args (CUMULATIVE_ARGS *cum, /* Argument info to initialize */ cum->nregs = 0; cum->sse_nregs = 0; cum->mmx_nregs = 0; + cum->warn_avx = 0; cum->warn_sse = 0; cum->warn_mmx = 0; return; @@ -4963,6 +5008,8 @@ classify_argument (enum machine_mode mode, const_tree type, classes[0] = classes[1] = X86_64_INTEGER_CLASS; return 2; case CTImode: + case COImode: + case OImode: return 0; case SFmode: if (!(bit_offset % 64)) @@ -4994,6 +5041,14 @@ classify_argument (enum machine_mode mode, const_tree type, case TCmode: /* This modes is larger than 16 bytes. 
*/ return 0; + case V8SFmode: + case V8SImode: + case V32QImode: + case V16HImode: + case V4DFmode: + case V4DImode: + classes[0] = X86_64_AVX_CLASS; + return 1; case V4SFmode: case V4SImode: case V16QImode: @@ -5050,6 +5105,7 @@ examine_argument (enum machine_mode mode, const_tree type, int in_return, case X86_64_INTEGERSI_CLASS: (*int_nregs)++; break; + case X86_64_AVX_CLASS: case X86_64_SSE_CLASS: case X86_64_SSESF_CLASS: case X86_64_SSEDF_CLASS: @@ -5148,6 +5204,7 @@ construct_container (enum machine_mode mode, enum machine_mode orig_mode, case X86_64_INTEGER_CLASS: case X86_64_INTEGERSI_CLASS: return gen_rtx_REG (mode, intreg[0]); + case X86_64_AVX_CLASS: case X86_64_SSE_CLASS: case X86_64_SSESF_CLASS: case X86_64_SSEDF_CLASS: @@ -5281,6 +5338,13 @@ function_arg_advance_32 (CUMULATIVE_ARGS *cum, enum machine_mode mode, break; /* FALLTHRU */ + case OImode: + case V8SFmode: + case V8SImode: + case V32QImode: + case V16HImode: + case V4DFmode: + case V4DImode: case TImode: case V16QImode: case V8HImode: @@ -5323,10 +5387,14 @@ function_arg_advance_32 (CUMULATIVE_ARGS *cum, enum machine_mode mode, static void function_arg_advance_64 (CUMULATIVE_ARGS *cum, enum machine_mode mode, - tree type, HOST_WIDE_INT words) + tree type, HOST_WIDE_INT words, int named) { int int_nregs, sse_nregs; + /* Unnamed 256bit vector mode parameters are passed on stack. */ + if (!named && VALID_AVX256_REG_MODE (mode)) + return; + if (!examine_argument (mode, type, 0, &int_nregs, &sse_nregs)) cum->words += words; else if (sse_nregs <= cum->sse_nregs && int_nregs <= cum->nregs) @@ -5357,7 +5425,7 @@ function_arg_advance_ms_64 (CUMULATIVE_ARGS *cum, HOST_WIDE_INT bytes, void function_arg_advance (CUMULATIVE_ARGS *cum, enum machine_mode mode, - tree type, int named ATTRIBUTE_UNUSED) + tree type, int named) { HOST_WIDE_INT bytes, words; @@ -5373,7 +5441,7 @@ function_arg_advance (CUMULATIVE_ARGS *cum, enum machine_mode mode, if (TARGET_64BIT && (cum ? cum->call_abi : DEFAULT_ABI) == MS_ABI) function_arg_advance_ms_64 (cum, bytes, words); else if (TARGET_64BIT) - function_arg_advance_64 (cum, mode, type, words); + function_arg_advance_64 (cum, mode, type, words, named); else function_arg_advance_32 (cum, mode, type, bytes, words); } @@ -5396,7 +5464,7 @@ function_arg_32 (CUMULATIVE_ARGS *cum, enum machine_mode mode, enum machine_mode orig_mode, tree type, HOST_WIDE_INT bytes, HOST_WIDE_INT words) { - static bool warnedsse, warnedmmx; + static bool warnedavx, warnedsse, warnedmmx; /* Avoid the AL settings for the Unix64 ABI. */ if (mode == VOIDmode) @@ -5445,6 +5513,7 @@ function_arg_32 (CUMULATIVE_ARGS *cum, enum machine_mode mode, break; /* FALLTHRU */ case TImode: + /* In 32bit, we pass TImode in xmm registers. */ case V16QImode: case V8HImode: case V4SImode: @@ -5465,6 +5534,28 @@ function_arg_32 (CUMULATIVE_ARGS *cum, enum machine_mode mode, } break; + case OImode: + /* In 32bit, we pass OImode in ymm registers. 
*/ + case V8SFmode: + case V8SImode: + case V32QImode: + case V16HImode: + case V4DFmode: + case V4DImode: + if (!type || !AGGREGATE_TYPE_P (type)) + { + if (!TARGET_AVX && !warnedavx && cum->warn_avx) + { + warnedavx = true; + warning (0, "AVX vector argument without AVX enabled " + "changes the ABI"); + } + if (cum->sse_nregs) + return gen_reg_or_parallel (mode, orig_mode, + cum->sse_regno + FIRST_SSE_REG); + } + break; + case V8QImode: case V4HImode: case V2SImode: @@ -5490,8 +5581,10 @@ function_arg_32 (CUMULATIVE_ARGS *cum, enum machine_mode mode, static rtx function_arg_64 (CUMULATIVE_ARGS *cum, enum machine_mode mode, - enum machine_mode orig_mode, tree type) + enum machine_mode orig_mode, tree type, int named) { + static bool warnedavx; + /* Handle a hidden AL argument containing number of registers for varargs x86-64 functions. */ if (mode == VOIDmode) @@ -5504,6 +5597,35 @@ function_arg_64 (CUMULATIVE_ARGS *cum, enum machine_mode mode, : cum->sse_regno) : -1); + switch (mode) + { + default: + break; + + case V8SFmode: + case V8SImode: + case V32QImode: + case V16HImode: + case V4DFmode: + case V4DImode: + /* In 64bit, we pass TImode in interger registers and OImode on + stack. */ + if (!type || !AGGREGATE_TYPE_P (type)) + { + if (!TARGET_AVX && !warnedavx && cum->warn_avx) + { + warnedavx = true; + warning (0, "AVX vector argument without AVX enabled " + "changes the ABI"); + } + } + + /* Unnamed 256bit vector mode parameters are passed on stack. */ + if (!named) + return NULL; + break; + } + return construct_container (mode, orig_mode, type, 0, cum->nregs, cum->sse_nregs, &x86_64_int_parameter_registers [cum->regno], @@ -5578,7 +5700,7 @@ function_arg (CUMULATIVE_ARGS *cum, enum machine_mode omode, if (TARGET_64BIT && (cum ? cum->call_abi : DEFAULT_ABI) == MS_ABI) return function_arg_ms_64 (cum, mode, omode, named, bytes); else if (TARGET_64BIT) - return function_arg_64 (cum, mode, omode, type); + return function_arg_64 (cum, mode, omode, type, named); else return function_arg_32 (cum, mode, omode, type, bytes, words); } @@ -6202,27 +6324,37 @@ setup_incoming_varargs_64 (CUMULATIVE_ARGS *cum) label_ref = gen_rtx_LABEL_REF (Pmode, label); /* Compute address to jump to : - label - eax*4 + nnamed_sse_arguments*4 */ + label - eax*4 + nnamed_sse_arguments*4 Or + label - eax*5 + nnamed_sse_arguments*5 for AVX. */ tmp_reg = gen_reg_rtx (Pmode); nsse_reg = gen_reg_rtx (Pmode); emit_insn (gen_zero_extendqidi2 (nsse_reg, gen_rtx_REG (QImode, AX_REG))); emit_insn (gen_rtx_SET (VOIDmode, tmp_reg, gen_rtx_MULT (Pmode, nsse_reg, GEN_INT (4)))); + + /* vmovaps is one byte longer than movaps. */ + if (TARGET_AVX) + emit_insn (gen_rtx_SET (VOIDmode, tmp_reg, + gen_rtx_PLUS (Pmode, tmp_reg, + nsse_reg))); + if (cum->sse_regno) emit_move_insn (nsse_reg, gen_rtx_CONST (DImode, gen_rtx_PLUS (DImode, label_ref, - GEN_INT (cum->sse_regno * 4)))); + GEN_INT (cum->sse_regno + * (TARGET_AVX ? 5 : 4))))); else emit_move_insn (nsse_reg, label_ref); emit_insn (gen_subdi3 (nsse_reg, nsse_reg, tmp_reg)); /* Compute address of memory block we save into. We always use pointer pointing 127 bytes after first byte to store - this is needed to keep - instruction size limited by 4 bytes. */ + instruction size limited by 4 bytes (5 bytes for AVX) with one + byte displacement. 
*/ tmp_reg = gen_reg_rtx (Pmode); emit_insn (gen_rtx_SET (VOIDmode, tmp_reg, plus_constant (save_area, @@ -6416,9 +6548,28 @@ ix86_gimplify_va_arg (tree valist, tree type, gimple_seq *pre_p, rsize = (size + UNITS_PER_WORD - 1) / UNITS_PER_WORD; nat_mode = type_natural_mode (type); - container = construct_container (nat_mode, TYPE_MODE (type), type, 0, - X86_64_REGPARM_MAX, X86_64_SSE_REGPARM_MAX, - intreg, 0); + switch (nat_mode) + { + case V8SFmode: + case V8SImode: + case V32QImode: + case V16HImode: + case V4DFmode: + case V4DImode: + /* Unnamed 256bit vector mode parameters are passed on stack. */ + if (ix86_cfun_abi () == SYSV_ABI) + { + container = NULL; + break; + } + + default: + container = construct_container (nat_mode, TYPE_MODE (type), + type, 0, X86_64_REGPARM_MAX, + X86_64_SSE_REGPARM_MAX, intreg, + 0); + break; + } /* Pull the value out of the saved registers. */ @@ -6793,8 +6944,10 @@ standard_sse_mode_p (enum machine_mode mode) } } -/* Return 1 if X is FP constant we can load to SSE register w/o using memory. - */ +/* Return 1 if X is all 0s. For all 1s, return 2 if X is in 128bit + SSE modes and SSE2 is enabled, return 3 if X is in 256bit AVX + modes and AVX is enabled. */ + int standard_sse_constant_p (rtx x) { @@ -6802,9 +6955,13 @@ standard_sse_constant_p (rtx x) if (x == const0_rtx || x == CONST0_RTX (GET_MODE (x))) return 1; - if (vector_all_ones_operand (x, mode) - && standard_sse_mode_p (mode)) - return TARGET_SSE2 ? 2 : -1; + if (vector_all_ones_operand (x, mode)) + { + if (standard_sse_mode_p (mode)) + return TARGET_SSE2 ? 2 : -2; + else if (VALID_AVX256_REG_MODE (mode)) + return TARGET_AVX ? 3 : -3; + } return 0; } @@ -6818,14 +6975,37 @@ standard_sse_constant_opcode (rtx insn, rtx x) switch (standard_sse_constant_p (x)) { case 1: - if (get_attr_mode (insn) == MODE_V4SF) - return "xorps\t%0, %0"; - else if (get_attr_mode (insn) == MODE_V2DF) - return "xorpd\t%0, %0"; - else - return "pxor\t%0, %0"; + switch (get_attr_mode (insn)) + { + case MODE_V4SF: + return TARGET_AVX ? "vxorps\t%0, %0, %0" : "xorps\t%0, %0"; + case MODE_V2DF: + return TARGET_AVX ? "vxorpd\t%0, %0, %0" : "xorpd\t%0, %0"; + case MODE_TI: + return TARGET_AVX ? "vpxor\t%0, %0, %0" : "pxor\t%0, %0"; + case MODE_V8SF: + return "vxorps\t%x0, %x0, %x0"; + case MODE_V4DF: + return "vxorpd\t%x0, %x0, %x0"; + case MODE_OI: + return "vpxor\t%x0, %x0, %x0"; + default: + gcc_unreachable (); + } case 2: - return "pcmpeqd\t%0, %0"; + if (TARGET_AVX) + switch (get_attr_mode (insn)) + { + case MODE_V4SF: + case MODE_V2DF: + case MODE_TI: + return "vpcmpeqd\t%0, %0, %0"; + break; + default: + gcc_unreachable (); + } + else + return "pcmpeqd\t%0, %0"; } gcc_unreachable (); } @@ -10035,12 +10215,19 @@ put_condition_code (enum rtx_code code, enum machine_mode mode, int reverse, If CODE is 'b', pretend the mode is QImode. If CODE is 'k', pretend the mode is SImode. If CODE is 'q', pretend the mode is DImode. + If CODE is 'x', pretend the mode is V4SFmode. + If CODE is 't', pretend the mode is V8SFmode. If CODE is 'h', pretend the reg is the 'high' byte register. - If CODE is 'y', print "st(0)" instead of "st", if the reg is stack op. */ + If CODE is 'y', print "st(0)" instead of "st", if the reg is stack op. + If CODE is 'd', duplicate the operand for AVX instruction. 
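standard_sse_constant_p above now folds the required ISA into its return value: 1 means all zeros, 2 or -2 means all ones in a 128-bit mode with or without SSE2, and 3 or -3 means all ones in a 256-bit mode with or without AVX; standard_sse_constant_opcode then turns the positive codes into the cheapest materialising instruction. A small table-style sketch of that mapping, mnemonics only and with an illustrative mode enum:

#include <stdio.h>

enum vec_mode { MODE_V4SF, MODE_V2DF, MODE_TI, MODE_V8SF, MODE_V4DF, MODE_OI };

/* Pick the instruction used to materialise a standard SSE/AVX constant:
   code 1 = all zeros (xor the register with itself),
   code 2 = all ones in a 128-bit mode (pcmpeqd reg,reg).  */
static const char *
const_insn (int code, enum vec_mode mode, int avx)
{
  if (code == 1)
    switch (mode)
      {
      case MODE_V4SF: return avx ? "vxorps" : "xorps";
      case MODE_V2DF: return avx ? "vxorpd" : "xorpd";
      case MODE_TI:   return avx ? "vpxor"  : "pxor";
      case MODE_V8SF: return "vxorps";   /* 256-bit modes imply AVX.  */
      case MODE_V4DF: return "vxorpd";
      case MODE_OI:   return "vpxor";
      }
  if (code == 2)
    return avx ? "vpcmpeqd" : "pcmpeqd";
  return NULL;
}

int
main (void)
{
  printf ("%s\n", const_insn (1, MODE_V8SF, 1));
  printf ("%s\n", const_insn (2, MODE_TI, 0));
  return 0;
}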
+ */ void print_reg (rtx x, int code, FILE *file) { + const char *reg; + bool duplicated = code == 'd' && TARGET_AVX; + gcc_assert (x == pc_rtx || (REGNO (x) != ARG_POINTER_REGNUM && REGNO (x) != FRAME_POINTER_REGNUM @@ -10070,6 +10257,10 @@ print_reg (rtx x, int code, FILE *file) code = 3; else if (code == 'h') code = 0; + else if (code == 'x') + code = 16; + else if (code == 't') + code = 32; else code = GET_MODE_SIZE (GET_MODE (x)); @@ -10101,12 +10292,14 @@ print_reg (rtx x, int code, FILE *file) } return; } + + reg = NULL; switch (code) { case 3: if (STACK_TOP_P (x)) { - fputs ("st(0)", file); + reg = "st(0)"; break; } /* FALLTHRU */ @@ -10119,21 +10312,39 @@ print_reg (rtx x, int code, FILE *file) case 16: case 2: normal: - fputs (hi_reg_name[REGNO (x)], file); + reg = hi_reg_name[REGNO (x)]; break; case 1: if (REGNO (x) >= ARRAY_SIZE (qi_reg_name)) goto normal; - fputs (qi_reg_name[REGNO (x)], file); + reg = qi_reg_name[REGNO (x)]; break; case 0: if (REGNO (x) >= ARRAY_SIZE (qi_high_reg_name)) goto normal; - fputs (qi_high_reg_name[REGNO (x)], file); + reg = qi_high_reg_name[REGNO (x)]; + break; + case 32: + if (SSE_REG_P (x)) + { + gcc_assert (!duplicated); + putc ('y', file); + fputs (hi_reg_name[REGNO (x)] + 1, file); + return; + } break; default: gcc_unreachable (); } + + fputs (reg, file); + if (duplicated) + { + if (ASSEMBLER_DIALECT == ASM_ATT) + fprintf (file, ", %%%s", reg); + else + fprintf (file, ", %s", reg); + } } /* Locate some local-dynamic symbol still in use by this function @@ -10191,8 +10402,11 @@ get_some_local_dynamic_name (void) w -- likewise, print the HImode name of the register. k -- likewise, print the SImode name of the register. q -- likewise, print the DImode name of the register. + x -- likewise, print the V4SFmode name of the register. + t -- likewise, print the V8SFmode name of the register. h -- print the QImode name for a "high" register, either ah, bh, ch or dh. y -- print "st(0)" instead of "st" as a register. + d -- print duplicated register operand for AVX instruction. D -- print condition for SSE cmp instruction. P -- if PIC, print an @PLT suffix. X -- don't print any sort of PIC '@' suffix for a symbol. @@ -10343,12 +10557,15 @@ print_operand (FILE *file, rtx x, int code) gcc_unreachable (); } + case 'd': case 'b': case 'w': case 'k': case 'q': case 'h': + case 't': case 'y': + case 'x': case 'X': case 'P': break; @@ -10365,40 +10582,93 @@ print_operand (FILE *file, rtx x, int code) /* Little bit of braindamage here. The SSE compare instructions does use completely different names for the comparisons that the fp conditional moves. 
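Two details of print_reg above are easy to miss: the 32-byte case reuses the XMM name table by emitting a 'y' and then the stored name with its leading 'x' skipped (so "xmm3" prints as "ymm3"), and the new 'd' code prints the operand a second time so a two-operand template can drive a three-operand AVX instruction. A standalone sketch of both tricks; sse_name and print_sse_reg stand in for hi_reg_name and print_reg:

#include <stdio.h>

/* Stand-in for the SSE slice of hi_reg_name: names are stored in xmm form.  */
static const char *const sse_name[] = { "xmm0", "xmm1", "xmm2", "xmm3" };

/* Print one SSE register operand.  WIDTH 16 keeps the xmm name, WIDTH 32
   prints 'y' plus the stored name with its leading 'x' skipped ("ymmN").
   DUP appends ", %name" so "op %1, %0" expands to "op %reg, %reg".  */
static void
print_sse_reg (FILE *f, int regno, int width, int dup)
{
  if (width == 32)
    {
      putc ('y', f);
      fputs (sse_name[regno] + 1, f);
    }
  else
    fputs (sse_name[regno], f);

  if (dup)
    fprintf (f, ", %%%s", sse_name[regno]);
}

int
main (void)
{
  print_sse_reg (stdout, 3, 32, 0);  /* prints: ymm3 */
  putc ('\n', stdout);
  print_sse_reg (stdout, 1, 16, 1);  /* prints: xmm1, %xmm1 */
  putc ('\n', stdout);
  return 0;
}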
*/ - switch (GET_CODE (x)) + if (TARGET_AVX) { - case EQ: - case UNEQ: - fputs ("eq", file); - break; - case LT: - case UNLT: - fputs ("lt", file); - break; - case LE: - case UNLE: - fputs ("le", file); - break; - case UNORDERED: - fputs ("unord", file); - break; - case NE: - case LTGT: - fputs ("neq", file); - break; - case UNGE: - case GE: - fputs ("nlt", file); - break; - case UNGT: - case GT: - fputs ("nle", file); - break; - case ORDERED: - fputs ("ord", file); - break; - default: - gcc_unreachable (); + switch (GET_CODE (x)) + { + case EQ: + fputs ("eq", file); + break; + case UNEQ: + fputs ("eq_us", file); + break; + case LT: + fputs ("lt", file); + break; + case UNLT: + fputs ("nge", file); + break; + case LE: + fputs ("le", file); + break; + case UNLE: + fputs ("ngt", file); + break; + case UNORDERED: + fputs ("unord", file); + break; + case NE: + fputs ("neq", file); + break; + case LTGT: + fputs ("neq_oq", file); + break; + case GE: + fputs ("ge", file); + break; + case UNGE: + fputs ("nlt", file); + break; + case GT: + fputs ("gt", file); + break; + case UNGT: + fputs ("nle", file); + break; + case ORDERED: + fputs ("ord", file); + break; + default: + gcc_unreachable (); + } + } + else + { + switch (GET_CODE (x)) + { + case EQ: + case UNEQ: + fputs ("eq", file); + break; + case LT: + case UNLT: + fputs ("lt", file); + break; + case LE: + case UNLE: + fputs ("le", file); + break; + case UNORDERED: + fputs ("unord", file); + break; + case NE: + case LTGT: + fputs ("neq", file); + break; + case UNGE: + case GE: + fputs ("nlt", file); + break; + case UNGT: + case GT: + fputs ("nle", file); + break; + case ORDERED: + fputs ("ord", file); + break; + default: + gcc_unreachable (); + } } return; case 'O': @@ -10951,7 +11221,7 @@ split_ti (rtx operands[], int num, rtx lo_half[], rtx hi_half[]) const char * output_387_binary_op (rtx insn, rtx *operands) { - static char buf[30]; + static char buf[40]; const char *p; const char *ssep; int is_sse = SSE_REG_P (operands[0]) || SSE_REG_P (operands[1]) || SSE_REG_P (operands[2]); @@ -10980,7 +11250,7 @@ output_387_binary_op (rtx insn, rtx *operands) p = "fiadd"; else p = "fadd"; - ssep = "add"; + ssep = "vadd"; break; case MINUS: @@ -10989,7 +11259,7 @@ output_387_binary_op (rtx insn, rtx *operands) p = "fisub"; else p = "fsub"; - ssep = "sub"; + ssep = "vsub"; break; case MULT: @@ -10998,7 +11268,7 @@ output_387_binary_op (rtx insn, rtx *operands) p = "fimul"; else p = "fmul"; - ssep = "mul"; + ssep = "vmul"; break; case DIV: @@ -11007,7 +11277,7 @@ output_387_binary_op (rtx insn, rtx *operands) p = "fidiv"; else p = "fdiv"; - ssep = "div"; + ssep = "vdiv"; break; default: @@ -11016,11 +11286,22 @@ output_387_binary_op (rtx insn, rtx *operands) if (is_sse) { - strcpy (buf, ssep); - if (GET_MODE (operands[0]) == SFmode) - strcat (buf, "ss\t{%2, %0|%0, %2}"); - else - strcat (buf, "sd\t{%2, %0|%0, %2}"); + if (TARGET_AVX) + { + strcpy (buf, ssep); + if (GET_MODE (operands[0]) == SFmode) + strcat (buf, "ss\t{%2, %1, %0|%0, %1, %2}"); + else + strcat (buf, "sd\t{%2, %1, %0|%0, %1, %2}"); + } + else + { + strcpy (buf, ssep + 1); + if (GET_MODE (operands[0]) == SFmode) + strcat (buf, "ss\t{%2, %0|%0, %2}"); + else + strcat (buf, "sd\t{%2, %0|%0, %2}"); + } return buf; } strcpy (buf, p); @@ -11382,16 +11663,21 @@ output_fp_compare (rtx insn, rtx *operands, int eflags_p, int unordered_p) if (is_sse) { + static const char ucomiss[] = "vucomiss\t{%1, %0|%0, %1}"; + static const char ucomisd[] = "vucomisd\t{%1, %0|%0, %1}"; + static const char comiss[] = 
"vcomiss\t{%1, %0|%0, %1}"; + static const char comisd[] = "vcomisd\t{%1, %0|%0, %1}"; + if (GET_MODE (operands[0]) == SFmode) if (unordered_p) - return "ucomiss\t{%1, %0|%0, %1}"; + return &ucomiss[TARGET_AVX ? 0 : 1]; else - return "comiss\t{%1, %0|%0, %1}"; + return &comiss[TARGET_AVX ? 0 : 1]; else if (unordered_p) - return "ucomisd\t{%1, %0|%0, %1}"; + return &ucomisd[TARGET_AVX ? 0 : 1]; else - return "comisd\t{%1, %0|%0, %1}"; + return &comisd[TARGET_AVX ? 0 : 1]; } gcc_assert (STACK_TOP_P (cmp_op0)); @@ -11805,6 +12091,58 @@ ix86_expand_vector_move_misalign (enum machine_mode mode, rtx operands[]) op0 = operands[0]; op1 = operands[1]; + if (TARGET_AVX) + { + switch (GET_MODE_CLASS (mode)) + { + case MODE_VECTOR_INT: + case MODE_INT: + switch (GET_MODE_SIZE (mode)) + { + case 16: + op0 = gen_lowpart (V16QImode, op0); + op1 = gen_lowpart (V16QImode, op1); + emit_insn (gen_avx_movdqu (op0, op1)); + break; + case 32: + op0 = gen_lowpart (V32QImode, op0); + op1 = gen_lowpart (V32QImode, op1); + emit_insn (gen_avx_movdqu256 (op0, op1)); + break; + default: + gcc_unreachable (); + } + break; + case MODE_VECTOR_FLOAT: + op0 = gen_lowpart (mode, op0); + op1 = gen_lowpart (mode, op1); + + switch (mode) + { + case V4SFmode: + emit_insn (gen_avx_movups (op0, op1)); + break; + case V8SFmode: + emit_insn (gen_avx_movups256 (op0, op1)); + break; + case V2DFmode: + emit_insn (gen_avx_movupd (op0, op1)); + break; + case V4DFmode: + emit_insn (gen_avx_movupd256 (op0, op1)); + break; + default: + gcc_unreachable (); + } + break; + + default: + gcc_unreachable (); + } + + return; + } + if (MEM_P (op1)) { /* If we're optimizing for size, movups is the smallest. */ @@ -17927,6 +18265,44 @@ ix86_attr_length_address_default (rtx insn) } return 0; } + +/* Compute default value for "length_vex" attribute. It includes + 2 or 3 byte VEX prefix and 1 opcode byte. */ + +int +ix86_attr_length_vex_default (rtx insn, int has_0f_opcode, + int has_vex_w) +{ + int i; + + /* Only 0f opcode can use 2 byte VEX prefix and VEX W bit uses 3 + byte VEX prefix. */ + if (!has_0f_opcode || has_vex_w) + return 3 + 1; + + /* We can always use 2 byte VEX prefix in 32bit. */ + if (!TARGET_64BIT) + return 2 + 1; + + extract_insn_cached (insn); + + for (i = recog_data.n_operands - 1; i >= 0; --i) + if (REG_P (recog_data.operand[i])) + { + /* REX.W bit uses 3 byte VEX prefix. */ + if (GET_MODE (recog_data.operand[i]) == DImode) + return 3 + 1; + } + else + { + /* REX.X or REX.B bits use 3 byte VEX prefix. */ + if (MEM_P (recog_data.operand[i]) + && x86_extended_reg_mentioned_p (recog_data.operand[i])) + return 3 + 1; + } + + return 2 + 1; +} /* Return the maximum number of instructions a cpu can issue. 
*/ @@ -18994,6 +19370,144 @@ enum ix86_builtins /* PCLMUL instruction */ IX86_BUILTIN_PCLMULQDQ128, + /* AVX */ + IX86_BUILTIN_ADDPD256, + IX86_BUILTIN_ADDPS256, + IX86_BUILTIN_ADDSUBPD256, + IX86_BUILTIN_ADDSUBPS256, + IX86_BUILTIN_ANDPD256, + IX86_BUILTIN_ANDPS256, + IX86_BUILTIN_ANDNPD256, + IX86_BUILTIN_ANDNPS256, + IX86_BUILTIN_BLENDPD256, + IX86_BUILTIN_BLENDPS256, + IX86_BUILTIN_BLENDVPD256, + IX86_BUILTIN_BLENDVPS256, + IX86_BUILTIN_DIVPD256, + IX86_BUILTIN_DIVPS256, + IX86_BUILTIN_DPPS256, + IX86_BUILTIN_HADDPD256, + IX86_BUILTIN_HADDPS256, + IX86_BUILTIN_HSUBPD256, + IX86_BUILTIN_HSUBPS256, + IX86_BUILTIN_MAXPD256, + IX86_BUILTIN_MAXPS256, + IX86_BUILTIN_MINPD256, + IX86_BUILTIN_MINPS256, + IX86_BUILTIN_MULPD256, + IX86_BUILTIN_MULPS256, + IX86_BUILTIN_ORPD256, + IX86_BUILTIN_ORPS256, + IX86_BUILTIN_SHUFPD256, + IX86_BUILTIN_SHUFPS256, + IX86_BUILTIN_SUBPD256, + IX86_BUILTIN_SUBPS256, + IX86_BUILTIN_XORPD256, + IX86_BUILTIN_XORPS256, + IX86_BUILTIN_CMPSD, + IX86_BUILTIN_CMPSS, + IX86_BUILTIN_CMPPD, + IX86_BUILTIN_CMPPS, + IX86_BUILTIN_CMPPD256, + IX86_BUILTIN_CMPPS256, + IX86_BUILTIN_CVTDQ2PD256, + IX86_BUILTIN_CVTDQ2PS256, + IX86_BUILTIN_CVTPD2PS256, + IX86_BUILTIN_CVTPS2DQ256, + IX86_BUILTIN_CVTPS2PD256, + IX86_BUILTIN_CVTTPD2DQ256, + IX86_BUILTIN_CVTPD2DQ256, + IX86_BUILTIN_CVTTPS2DQ256, + IX86_BUILTIN_EXTRACTF128PD256, + IX86_BUILTIN_EXTRACTF128PS256, + IX86_BUILTIN_EXTRACTF128SI256, + IX86_BUILTIN_VZEROALL, + IX86_BUILTIN_VZEROUPPER, + IX86_BUILTIN_VZEROUPPER_REX64, + IX86_BUILTIN_VPERMILVARPD, + IX86_BUILTIN_VPERMILVARPS, + IX86_BUILTIN_VPERMILVARPD256, + IX86_BUILTIN_VPERMILVARPS256, + IX86_BUILTIN_VPERMILPD, + IX86_BUILTIN_VPERMILPS, + IX86_BUILTIN_VPERMILPD256, + IX86_BUILTIN_VPERMILPS256, + IX86_BUILTIN_VPERMIL2PD, + IX86_BUILTIN_VPERMIL2PS, + IX86_BUILTIN_VPERMIL2PD256, + IX86_BUILTIN_VPERMIL2PS256, + IX86_BUILTIN_VPERM2F128PD256, + IX86_BUILTIN_VPERM2F128PS256, + IX86_BUILTIN_VPERM2F128SI256, + IX86_BUILTIN_VBROADCASTSS, + IX86_BUILTIN_VBROADCASTSD256, + IX86_BUILTIN_VBROADCASTSS256, + IX86_BUILTIN_VBROADCASTPD256, + IX86_BUILTIN_VBROADCASTPS256, + IX86_BUILTIN_VINSERTF128PD256, + IX86_BUILTIN_VINSERTF128PS256, + IX86_BUILTIN_VINSERTF128SI256, + IX86_BUILTIN_LOADUPD256, + IX86_BUILTIN_LOADUPS256, + IX86_BUILTIN_STOREUPD256, + IX86_BUILTIN_STOREUPS256, + IX86_BUILTIN_LDDQU256, + IX86_BUILTIN_LOADDQU256, + IX86_BUILTIN_STOREDQU256, + IX86_BUILTIN_MASKLOADPD, + IX86_BUILTIN_MASKLOADPS, + IX86_BUILTIN_MASKSTOREPD, + IX86_BUILTIN_MASKSTOREPS, + IX86_BUILTIN_MASKLOADPD256, + IX86_BUILTIN_MASKLOADPS256, + IX86_BUILTIN_MASKSTOREPD256, + IX86_BUILTIN_MASKSTOREPS256, + IX86_BUILTIN_MOVSHDUP256, + IX86_BUILTIN_MOVSLDUP256, + IX86_BUILTIN_MOVDDUP256, + + IX86_BUILTIN_SQRTPD256, + IX86_BUILTIN_SQRTPS256, + IX86_BUILTIN_SQRTPS_NR256, + IX86_BUILTIN_RSQRTPS256, + IX86_BUILTIN_RSQRTPS_NR256, + + IX86_BUILTIN_RCPPS256, + + IX86_BUILTIN_ROUNDPD256, + IX86_BUILTIN_ROUNDPS256, + + IX86_BUILTIN_UNPCKHPD256, + IX86_BUILTIN_UNPCKLPD256, + IX86_BUILTIN_UNPCKHPS256, + IX86_BUILTIN_UNPCKLPS256, + + IX86_BUILTIN_SI256_SI, + IX86_BUILTIN_PS256_PS, + IX86_BUILTIN_PD256_PD, + IX86_BUILTIN_SI_SI256, + IX86_BUILTIN_PS_PS256, + IX86_BUILTIN_PD_PD256, + + IX86_BUILTIN_VTESTZPD, + IX86_BUILTIN_VTESTCPD, + IX86_BUILTIN_VTESTNZCPD, + IX86_BUILTIN_VTESTZPS, + IX86_BUILTIN_VTESTCPS, + IX86_BUILTIN_VTESTNZCPS, + IX86_BUILTIN_VTESTZPD256, + IX86_BUILTIN_VTESTCPD256, + IX86_BUILTIN_VTESTNZCPD256, + IX86_BUILTIN_VTESTZPS256, + IX86_BUILTIN_VTESTCPS256, + IX86_BUILTIN_VTESTNZCPS256, + IX86_BUILTIN_PTESTZ256, + 
IX86_BUILTIN_PTESTC256, + IX86_BUILTIN_PTESTNZC256, + + IX86_BUILTIN_MOVMSKPD256, + IX86_BUILTIN_MOVMSKPS256, + /* TFmode support builtins. */ IX86_BUILTIN_INFQ, IX86_BUILTIN_FABSQ, @@ -19328,19 +19842,35 @@ enum ix86_special_builtin_type { SPECIAL_FTYPE_UNKNOWN, VOID_FTYPE_VOID, + V32QI_FTYPE_PCCHAR, V16QI_FTYPE_PCCHAR, + V8SF_FTYPE_PCV4SF, + V8SF_FTYPE_PCFLOAT, + V4DF_FTYPE_PCV2DF, + V4DF_FTYPE_PCDOUBLE, V4SF_FTYPE_PCFLOAT, V2DF_FTYPE_PCDOUBLE, + V8SF_FTYPE_PCV8SF_V8SF, + V4DF_FTYPE_PCV4DF_V4DF, V4SF_FTYPE_V4SF_PCV2SF, + V4SF_FTYPE_PCV4SF_V4SF, V2DF_FTYPE_V2DF_PCDOUBLE, + V2DF_FTYPE_PCV2DF_V2DF, V2DI_FTYPE_PV2DI, VOID_FTYPE_PV2SF_V4SF, VOID_FTYPE_PV2DI_V2DI, + VOID_FTYPE_PCHAR_V32QI, VOID_FTYPE_PCHAR_V16QI, + VOID_FTYPE_PFLOAT_V8SF, VOID_FTYPE_PFLOAT_V4SF, + VOID_FTYPE_PDOUBLE_V4DF, VOID_FTYPE_PDOUBLE_V2DF, VOID_FTYPE_PDI_DI, - VOID_FTYPE_PINT_INT + VOID_FTYPE_PINT_INT, + VOID_FTYPE_PV8SF_V8SF_V8SF, + VOID_FTYPE_PV4DF_V4DF_V4DF, + VOID_FTYPE_PV4SF_V4SF_V4SF, + VOID_FTYPE_PV2DF_V2DF_V2DF }; /* Builtin types */ @@ -19350,25 +19880,45 @@ enum ix86_builtin_type FLOAT128_FTYPE_FLOAT128, FLOAT_FTYPE_FLOAT, FLOAT128_FTYPE_FLOAT128_FLOAT128, + INT_FTYPE_V8SF_V8SF_PTEST, + INT_FTYPE_V4DI_V4DI_PTEST, + INT_FTYPE_V4DF_V4DF_PTEST, + INT_FTYPE_V4SF_V4SF_PTEST, INT_FTYPE_V2DI_V2DI_PTEST, + INT_FTYPE_V2DF_V2DF_PTEST, INT64_FTYPE_V4SF, INT64_FTYPE_V2DF, INT_FTYPE_V16QI, INT_FTYPE_V8QI, + INT_FTYPE_V8SF, + INT_FTYPE_V4DF, INT_FTYPE_V4SF, INT_FTYPE_V2DF, V16QI_FTYPE_V16QI, + V8SI_FTYPE_V8SF, + V8SI_FTYPE_V4SI, V8HI_FTYPE_V8HI, V8HI_FTYPE_V16QI, V8QI_FTYPE_V8QI, + V8SF_FTYPE_V8SF, + V8SF_FTYPE_V8SI, + V8SF_FTYPE_V4SF, V4SI_FTYPE_V4SI, V4SI_FTYPE_V16QI, + V4SI_FTYPE_V8SI, V4SI_FTYPE_V8HI, + V4SI_FTYPE_V4DF, V4SI_FTYPE_V4SF, V4SI_FTYPE_V2DF, V4HI_FTYPE_V4HI, + V4DF_FTYPE_V4DF, + V4DF_FTYPE_V4SI, + V4DF_FTYPE_V4SF, + V4DF_FTYPE_V2DF, + V4SF_FTYPE_V4DF, V4SF_FTYPE_V4SF, V4SF_FTYPE_V4SF_VEC_MERGE, + V4SF_FTYPE_V8SF, V4SF_FTYPE_V4SI, V4SF_FTYPE_V2DF, V2DI_FTYPE_V2DI, @@ -19378,6 +19928,7 @@ enum ix86_builtin_type V2DF_FTYPE_V2DF, V2DF_FTYPE_V2DF_VEC_MERGE, V2DF_FTYPE_V4SI, + V2DF_FTYPE_V4DF, V2DF_FTYPE_V4SF, V2DF_FTYPE_V2SI, V2SI_FTYPE_V2SI, @@ -19395,6 +19946,8 @@ enum ix86_builtin_type V8HI_FTYPE_V16QI_V16QI, V8HI_FTYPE_V4SI_V4SI, V8HI_FTYPE_V8HI_SI_COUNT, + V8SF_FTYPE_V8SF_V8SF, + V8SF_FTYPE_V8SF_V8SI, V4SI_FTYPE_V4SI_V4SI, V4SI_FTYPE_V4SI_V4SI_COUNT, V4SI_FTYPE_V8HI_V8HI, @@ -19406,8 +19959,11 @@ enum ix86_builtin_type V4HI_FTYPE_V8QI_V8QI, V4HI_FTYPE_V2SI_V2SI, V4HI_FTYPE_V4HI_SI_COUNT, + V4DF_FTYPE_V4DF_V4DF, + V4DF_FTYPE_V4DF_V4DI, V4SF_FTYPE_V4SF_V4SF, V4SF_FTYPE_V4SF_V4SF_SWAP, + V4SF_FTYPE_V4SF_V4SI, V4SF_FTYPE_V4SF_V2SI, V4SF_FTYPE_V4SF_V2DF, V4SF_FTYPE_V4SF_DI, @@ -19427,6 +19983,7 @@ enum ix86_builtin_type V2DF_FTYPE_V2DF_V2DF, V2DF_FTYPE_V2DF_V2DF_SWAP, V2DF_FTYPE_V2DF_V4SF, + V2DF_FTYPE_V2DF_V2DI, V2DF_FTYPE_V2DF_DI, V2DF_FTYPE_V2DF_SI, V2SF_FTYPE_V2SF_V2SF, @@ -19442,21 +19999,38 @@ enum ix86_builtin_type V8HI_FTYPE_V8HI_INT, V4SI_FTYPE_V4SI_INT, V4HI_FTYPE_V4HI_INT, + V8SF_FTYPE_V8SF_INT, + V4SI_FTYPE_V8SI_INT, + V4SF_FTYPE_V8SF_INT, + V2DF_FTYPE_V4DF_INT, + V4DF_FTYPE_V4DF_INT, V4SF_FTYPE_V4SF_INT, V2DI_FTYPE_V2DI_INT, V2DI2TI_FTYPE_V2DI_INT, V2DF_FTYPE_V2DF_INT, V16QI_FTYPE_V16QI_V16QI_V16QI, + V8SF_FTYPE_V8SF_V8SF_V8SF, + V4DF_FTYPE_V4DF_V4DF_V4DF, V4SF_FTYPE_V4SF_V4SF_V4SF, V2DF_FTYPE_V2DF_V2DF_V2DF, V16QI_FTYPE_V16QI_V16QI_INT, + V8SI_FTYPE_V8SI_V8SI_INT, + V8SI_FTYPE_V8SI_V4SI_INT, V8HI_FTYPE_V8HI_V8HI_INT, + V8SF_FTYPE_V8SF_V8SF_INT, + V8SF_FTYPE_V8SF_V4SF_INT, V4SI_FTYPE_V4SI_V4SI_INT, + 
V4DF_FTYPE_V4DF_V4DF_INT, + V4DF_FTYPE_V4DF_V2DF_INT, V4SF_FTYPE_V4SF_V4SF_INT, V2DI_FTYPE_V2DI_V2DI_INT, V2DI2TI_FTYPE_V2DI_V2DI_INT, V1DI2DI_FTYPE_V1DI_V1DI_INT, V2DF_FTYPE_V2DF_V2DF_INT, + V8SF_FTYPE_V8SF_V8SF_V8SI_INT, + V4DF_FTYPE_V4DF_V4DF_V4DI_INT, + V4SF_FTYPE_V4SF_V4SF_V4SI_INT, + V2DF_FTYPE_V2DF_V2DF_V2DI_INT, V2DI_FTYPE_V2DI_UINT_UINT, V2DI_FTYPE_V2DI_V2DI_UINT_UINT }; @@ -19507,6 +20081,34 @@ static const struct builtin_description bdesc_special_args[] = /* SSE4A */ { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_vmmovntv2df, "__builtin_ia32_movntsd", IX86_BUILTIN_MOVNTSD, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V2DF }, { OPTION_MASK_ISA_SSE4A, CODE_FOR_sse4a_vmmovntv4sf, "__builtin_ia32_movntss", IX86_BUILTIN_MOVNTSS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF }, + + /* AVX */ + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vzeroall, "__builtin_ia32_vzeroall", IX86_BUILTIN_VZEROALL, UNKNOWN, (int) VOID_FTYPE_VOID }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vzeroupper, 0, IX86_BUILTIN_VZEROUPPER, UNKNOWN, (int) VOID_FTYPE_VOID }, + { OPTION_MASK_ISA_AVX | OPTION_MASK_ISA_64BIT, CODE_FOR_avx_vzeroupper_rex64, 0, IX86_BUILTIN_VZEROUPPER_REX64, UNKNOWN, (int) VOID_FTYPE_VOID }, + + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastss, "__builtin_ia32_vbroadcastss", IX86_BUILTIN_VBROADCASTSS, UNKNOWN, (int) V4SF_FTYPE_PCFLOAT }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastsd256, "__builtin_ia32_vbroadcastsd256", IX86_BUILTIN_VBROADCASTSD256, UNKNOWN, (int) V4DF_FTYPE_PCDOUBLE }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastss256, "__builtin_ia32_vbroadcastss256", IX86_BUILTIN_VBROADCASTSS256, UNKNOWN, (int) V8SF_FTYPE_PCFLOAT }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastf128_pd256, "__builtin_ia32_vbroadcastf128_pd256", IX86_BUILTIN_VBROADCASTPD256, UNKNOWN, (int) V4DF_FTYPE_PCV2DF }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vbroadcastf128_ps256, "__builtin_ia32_vbroadcastf128_ps256", IX86_BUILTIN_VBROADCASTPS256, UNKNOWN, (int) V8SF_FTYPE_PCV4SF }, + + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movupd256, "__builtin_ia32_loadupd256", IX86_BUILTIN_LOADUPD256, UNKNOWN, (int) V4DF_FTYPE_PCDOUBLE }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movups256, "__builtin_ia32_loadups256", IX86_BUILTIN_LOADUPS256, UNKNOWN, (int) V8SF_FTYPE_PCFLOAT }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movupd256, "__builtin_ia32_storeupd256", IX86_BUILTIN_STOREUPD256, UNKNOWN, (int) VOID_FTYPE_PDOUBLE_V4DF }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movups256, "__builtin_ia32_storeups256", IX86_BUILTIN_STOREUPS256, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V8SF }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movdqu256, "__builtin_ia32_loaddqu256", IX86_BUILTIN_LOADDQU256, UNKNOWN, (int) V32QI_FTYPE_PCCHAR }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movdqu256, "__builtin_ia32_storedqu256", IX86_BUILTIN_STOREDQU256, UNKNOWN, (int) VOID_FTYPE_PCHAR_V32QI }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_lddqu256, "__builtin_ia32_lddqu256", IX86_BUILTIN_LDDQU256, UNKNOWN, (int) V32QI_FTYPE_PCCHAR }, + + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadpd, "__builtin_ia32_maskloadpd", IX86_BUILTIN_MASKLOADPD, UNKNOWN, (int) V2DF_FTYPE_PCV2DF_V2DF }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadps, "__builtin_ia32_maskloadps", IX86_BUILTIN_MASKLOADPS, UNKNOWN, (int) V4SF_FTYPE_PCV4SF_V4SF }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadpd256, "__builtin_ia32_maskloadpd256", IX86_BUILTIN_MASKLOADPD256, UNKNOWN, (int) V4DF_FTYPE_PCV4DF_V4DF }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskloadps256, "__builtin_ia32_maskloadps256", IX86_BUILTIN_MASKLOADPS256, UNKNOWN, (int) 
V8SF_FTYPE_PCV8SF_V8SF }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstorepd, "__builtin_ia32_maskstorepd", IX86_BUILTIN_MASKSTOREPD, UNKNOWN, (int) VOID_FTYPE_PV2DF_V2DF_V2DF }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstoreps, "__builtin_ia32_maskstoreps", IX86_BUILTIN_MASKSTOREPS, UNKNOWN, (int) VOID_FTYPE_PV4SF_V4SF_V4SF }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstorepd256, "__builtin_ia32_maskstorepd256", IX86_BUILTIN_MASKSTOREPD256, UNKNOWN, (int) VOID_FTYPE_PV4DF_V4DF_V4DF }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_maskstoreps256, "__builtin_ia32_maskstoreps256", IX86_BUILTIN_MASKSTOREPS256, UNKNOWN, (int) VOID_FTYPE_PV8SF_V8SF_V8SF }, }; /* Builtins with variable number of arguments. */ @@ -19991,6 +20593,124 @@ static const struct builtin_description bdesc_args[] = /* PCLMUL */ { OPTION_MASK_ISA_SSE2, CODE_FOR_pclmulqdq, 0, IX86_BUILTIN_PCLMULQDQ128, UNKNOWN, (int) V2DI_FTYPE_V2DI_V2DI_INT }, + + /* AVX */ + { OPTION_MASK_ISA_AVX, CODE_FOR_addv4df3, "__builtin_ia32_addpd256", IX86_BUILTIN_ADDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF }, + { OPTION_MASK_ISA_AVX, CODE_FOR_addv8sf3, "__builtin_ia32_addps256", IX86_BUILTIN_ADDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_addsubv4df3, "__builtin_ia32_addsubpd256", IX86_BUILTIN_ADDSUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_addsubv8sf3, "__builtin_ia32_addsubps256", IX86_BUILTIN_ADDSUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF }, + { OPTION_MASK_ISA_AVX, CODE_FOR_andv4df3, "__builtin_ia32_andpd256", IX86_BUILTIN_ANDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF }, + { OPTION_MASK_ISA_AVX, CODE_FOR_andv8sf3, "__builtin_ia32_andps256", IX86_BUILTIN_ANDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_nandv4df3, "__builtin_ia32_andnpd256", IX86_BUILTIN_ANDNPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_nandv8sf3, "__builtin_ia32_andnps256", IX86_BUILTIN_ANDNPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF }, + { OPTION_MASK_ISA_AVX, CODE_FOR_divv4df3, "__builtin_ia32_divpd256", IX86_BUILTIN_DIVPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_divv8sf3, "__builtin_ia32_divps256", IX86_BUILTIN_DIVPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_haddv4df3, "__builtin_ia32_haddpd256", IX86_BUILTIN_HADDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_hsubv8sf3, "__builtin_ia32_hsubps256", IX86_BUILTIN_HSUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_hsubv4df3, "__builtin_ia32_hsubpd256", IX86_BUILTIN_HSUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_haddv8sf3, "__builtin_ia32_haddps256", IX86_BUILTIN_HADDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF }, + { OPTION_MASK_ISA_AVX, CODE_FOR_smaxv4df3, "__builtin_ia32_maxpd256", IX86_BUILTIN_MAXPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF }, + { OPTION_MASK_ISA_AVX, CODE_FOR_smaxv8sf3, "__builtin_ia32_maxps256", IX86_BUILTIN_MAXPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF }, + { OPTION_MASK_ISA_AVX, CODE_FOR_sminv4df3, "__builtin_ia32_minpd256", IX86_BUILTIN_MINPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF }, + { OPTION_MASK_ISA_AVX, CODE_FOR_sminv8sf3, "__builtin_ia32_minps256", IX86_BUILTIN_MINPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF }, + { OPTION_MASK_ISA_AVX, CODE_FOR_mulv4df3, "__builtin_ia32_mulpd256", IX86_BUILTIN_MULPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF }, + { 
OPTION_MASK_ISA_AVX, CODE_FOR_mulv8sf3, "__builtin_ia32_mulps256", IX86_BUILTIN_MULPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF }, + { OPTION_MASK_ISA_AVX, CODE_FOR_iorv4df3, "__builtin_ia32_orpd256", IX86_BUILTIN_ORPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF }, + { OPTION_MASK_ISA_AVX, CODE_FOR_iorv8sf3, "__builtin_ia32_orps256", IX86_BUILTIN_ORPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF }, + { OPTION_MASK_ISA_AVX, CODE_FOR_subv4df3, "__builtin_ia32_subpd256", IX86_BUILTIN_SUBPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF }, + { OPTION_MASK_ISA_AVX, CODE_FOR_subv8sf3, "__builtin_ia32_subps256", IX86_BUILTIN_SUBPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF }, + { OPTION_MASK_ISA_AVX, CODE_FOR_xorv4df3, "__builtin_ia32_xorpd256", IX86_BUILTIN_XORPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF }, + { OPTION_MASK_ISA_AVX, CODE_FOR_xorv8sf3, "__builtin_ia32_xorps256", IX86_BUILTIN_XORPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF }, + + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv2df3, "__builtin_ia32_vpermilvarpd", IX86_BUILTIN_VPERMILVARPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DI }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv4sf3, "__builtin_ia32_vpermilvarps", IX86_BUILTIN_VPERMILVARPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SI }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv4df3, "__builtin_ia32_vpermilvarpd256", IX86_BUILTIN_VPERMILVARPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DI }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilvarv8sf3, "__builtin_ia32_vpermilvarps256", IX86_BUILTIN_VPERMILVARPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SI }, + + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendpd256, "__builtin_ia32_blendpd256", IX86_BUILTIN_BLENDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendps256, "__builtin_ia32_blendps256", IX86_BUILTIN_BLENDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendvpd256, "__builtin_ia32_blendvpd256", IX86_BUILTIN_BLENDVPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_V4DF }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_blendvps256, "__builtin_ia32_blendvps256", IX86_BUILTIN_BLENDVPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_V8SF }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_dpps256, "__builtin_ia32_dpps256", IX86_BUILTIN_DPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_shufpd256, "__builtin_ia32_shufpd256", IX86_BUILTIN_SHUFPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_shufps256, "__builtin_ia32_shufps256", IX86_BUILTIN_SHUFPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpsdv2df3, "__builtin_ia32_cmpsd", IX86_BUILTIN_CMPSD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmpssv4sf3, "__builtin_ia32_cmpss", IX86_BUILTIN_CMPSS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmppdv2df3, "__builtin_ia32_cmppd", IX86_BUILTIN_CMPPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_INT }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmppsv4sf3, "__builtin_ia32_cmpps", IX86_BUILTIN_CMPPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_INT }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmppdv4df3, "__builtin_ia32_cmppd256", IX86_BUILTIN_CMPPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cmppsv8sf3, "__builtin_ia32_cmpps256", IX86_BUILTIN_CMPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v4df, "__builtin_ia32_vextractf128_pd256", IX86_BUILTIN_EXTRACTF128PD256, UNKNOWN, (int) 
V2DF_FTYPE_V4DF_INT }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v8sf, "__builtin_ia32_vextractf128_ps256", IX86_BUILTIN_EXTRACTF128PS256, UNKNOWN, (int) V4SF_FTYPE_V8SF_INT }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vextractf128v8si, "__builtin_ia32_vextractf128_si256", IX86_BUILTIN_EXTRACTF128SI256, UNKNOWN, (int) V4SI_FTYPE_V8SI_INT }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtdq2pd256, "__builtin_ia32_cvtdq2pd256", IX86_BUILTIN_CVTDQ2PD256, UNKNOWN, (int) V4DF_FTYPE_V4SI }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtdq2ps256, "__builtin_ia32_cvtdq2ps256", IX86_BUILTIN_CVTDQ2PS256, UNKNOWN, (int) V8SF_FTYPE_V8SI }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtpd2ps256, "__builtin_ia32_cvtpd2ps256", IX86_BUILTIN_CVTPD2PS256, UNKNOWN, (int) V4SF_FTYPE_V4DF }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtps2dq256, "__builtin_ia32_cvtps2dq256", IX86_BUILTIN_CVTPS2DQ256, UNKNOWN, (int) V8SI_FTYPE_V8SF }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtps2pd256, "__builtin_ia32_cvtps2pd256", IX86_BUILTIN_CVTPS2PD256, UNKNOWN, (int) V4DF_FTYPE_V4SF }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvttpd2dq256, "__builtin_ia32_cvttpd2dq256", IX86_BUILTIN_CVTTPD2DQ256, UNKNOWN, (int) V4SI_FTYPE_V4DF }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvtpd2dq256, "__builtin_ia32_cvtpd2dq256", IX86_BUILTIN_CVTPD2DQ256, UNKNOWN, (int) V4SI_FTYPE_V4DF }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_cvttps2dq256, "__builtin_ia32_cvttps2dq256", IX86_BUILTIN_CVTTPS2DQ256, UNKNOWN, (int) V8SI_FTYPE_V8SF }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v4df3, "__builtin_ia32_vperm2f128_pd256", IX86_BUILTIN_VPERM2F128PD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_INT }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v8sf3, "__builtin_ia32_vperm2f128_ps256", IX86_BUILTIN_VPERM2F128PS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_INT }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vperm2f128v8si3, "__builtin_ia32_vperm2f128_si256", IX86_BUILTIN_VPERM2F128SI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V8SI_INT }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv2df, "__builtin_ia32_vpermilpd", IX86_BUILTIN_VPERMILPD, UNKNOWN, (int) V2DF_FTYPE_V2DF_INT }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv4sf, "__builtin_ia32_vpermilps", IX86_BUILTIN_VPERMILPS, UNKNOWN, (int) V4SF_FTYPE_V4SF_INT }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv4df, "__builtin_ia32_vpermilpd256", IX86_BUILTIN_VPERMILPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermilv8sf, "__builtin_ia32_vpermilps256", IX86_BUILTIN_VPERMILPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_INT }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermil2v2df3, "__builtin_ia32_vpermil2pd", IX86_BUILTIN_VPERMIL2PD, UNKNOWN, (int) V2DF_FTYPE_V2DF_V2DF_V2DI_INT }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermil2v4sf3, "__builtin_ia32_vpermil2ps", IX86_BUILTIN_VPERMIL2PS, UNKNOWN, (int) V4SF_FTYPE_V4SF_V4SF_V4SI_INT }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermil2v4df3, "__builtin_ia32_vpermil2pd256", IX86_BUILTIN_VPERMIL2PD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF_V4DI_INT }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vpermil2v8sf3, "__builtin_ia32_vpermil2ps256", IX86_BUILTIN_VPERMIL2PS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF_V8SI_INT }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v4df, "__builtin_ia32_vinsertf128_pd256", IX86_BUILTIN_VINSERTF128PD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V2DF_INT }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vinsertf128v8sf, "__builtin_ia32_vinsertf128_ps256", IX86_BUILTIN_VINSERTF128PS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V4SF_INT }, + { OPTION_MASK_ISA_AVX, 
CODE_FOR_avx_vinsertf128v8si, "__builtin_ia32_vinsertf128_si256", IX86_BUILTIN_VINSERTF128SI256, UNKNOWN, (int) V8SI_FTYPE_V8SI_V4SI_INT }, + + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movshdup256, "__builtin_ia32_movshdup256", IX86_BUILTIN_MOVSHDUP256, UNKNOWN, (int) V8SF_FTYPE_V8SF }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movsldup256, "__builtin_ia32_movsldup256", IX86_BUILTIN_MOVSLDUP256, UNKNOWN, (int) V8SF_FTYPE_V8SF }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movddup256, "__builtin_ia32_movddup256", IX86_BUILTIN_MOVDDUP256, UNKNOWN, (int) V4DF_FTYPE_V4DF }, + + { OPTION_MASK_ISA_AVX, CODE_FOR_sqrtv4df2, "__builtin_ia32_sqrtpd256", IX86_BUILTIN_SQRTPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_sqrtv8sf2, "__builtin_ia32_sqrtps256", IX86_BUILTIN_SQRTPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF }, + { OPTION_MASK_ISA_AVX, CODE_FOR_sqrtv8sf2, "__builtin_ia32_sqrtps_nr256", IX86_BUILTIN_SQRTPS_NR256, UNKNOWN, (int) V8SF_FTYPE_V8SF }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_rsqrtv8sf2, "__builtin_ia32_rsqrtps256", IX86_BUILTIN_RSQRTPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF }, + { OPTION_MASK_ISA_AVX, CODE_FOR_rsqrtv8sf2, "__builtin_ia32_rsqrtps_nr256", IX86_BUILTIN_RSQRTPS_NR256, UNKNOWN, (int) V8SF_FTYPE_V8SF }, + + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_rcpv8sf2, "__builtin_ia32_rcpps256", IX86_BUILTIN_RCPPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF }, + + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundpd256, "__builtin_ia32_roundpd256", IX86_BUILTIN_ROUNDPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_INT }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_roundps256, "__builtin_ia32_roundps256", IX86_BUILTIN_ROUNDPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_INT }, + + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpckhpd256, "__builtin_ia32_unpckhpd256", IX86_BUILTIN_UNPCKHPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpcklpd256, "__builtin_ia32_unpcklpd256", IX86_BUILTIN_UNPCKLPD256, UNKNOWN, (int) V4DF_FTYPE_V4DF_V4DF }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpckhps256, "__builtin_ia32_unpckhps256", IX86_BUILTIN_UNPCKHPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_unpcklps256, "__builtin_ia32_unpcklps256", IX86_BUILTIN_UNPCKLPS256, UNKNOWN, (int) V8SF_FTYPE_V8SF_V8SF }, + + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_si256_si, "__builtin_ia32_si256_si", IX86_BUILTIN_SI256_SI, UNKNOWN, (int) V8SI_FTYPE_V4SI }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ps256_ps, "__builtin_ia32_ps256_ps", IX86_BUILTIN_PS256_PS, UNKNOWN, (int) V8SF_FTYPE_V4SF }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_pd256_pd, "__builtin_ia32_pd256_pd", IX86_BUILTIN_PD256_PD, UNKNOWN, (int) V4DF_FTYPE_V2DF }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_si_si256, "__builtin_ia32_si_si256", IX86_BUILTIN_SI_SI256, UNKNOWN, (int) V4SI_FTYPE_V8SI }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ps_ps256, "__builtin_ia32_ps_ps256", IX86_BUILTIN_PS_PS256, UNKNOWN, (int) V4SF_FTYPE_V8SF }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_pd_pd256, "__builtin_ia32_pd_pd256", IX86_BUILTIN_PD_PD256, UNKNOWN, (int) V2DF_FTYPE_V4DF }, + + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestzpd", IX86_BUILTIN_VTESTZPD, EQ, (int) INT_FTYPE_V2DF_V2DF_PTEST }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestcpd", IX86_BUILTIN_VTESTCPD, LTU, (int) INT_FTYPE_V2DF_V2DF_PTEST }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd, "__builtin_ia32_vtestnzcpd", IX86_BUILTIN_VTESTNZCPD, GTU, (int) INT_FTYPE_V2DF_V2DF_PTEST }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestzps", 
IX86_BUILTIN_VTESTZPS, EQ, (int) INT_FTYPE_V4SF_V4SF_PTEST }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestcps", IX86_BUILTIN_VTESTCPS, LTU, (int) INT_FTYPE_V4SF_V4SF_PTEST }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps, "__builtin_ia32_vtestnzcps", IX86_BUILTIN_VTESTNZCPS, GTU, (int) INT_FTYPE_V4SF_V4SF_PTEST }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestzpd256", IX86_BUILTIN_VTESTZPD256, EQ, (int) INT_FTYPE_V4DF_V4DF_PTEST }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestcpd256", IX86_BUILTIN_VTESTCPD256, LTU, (int) INT_FTYPE_V4DF_V4DF_PTEST }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestpd256, "__builtin_ia32_vtestnzcpd256", IX86_BUILTIN_VTESTNZCPD256, GTU, (int) INT_FTYPE_V4DF_V4DF_PTEST }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestzps256", IX86_BUILTIN_VTESTZPS256, EQ, (int) INT_FTYPE_V8SF_V8SF_PTEST }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestcps256", IX86_BUILTIN_VTESTCPS256, LTU, (int) INT_FTYPE_V8SF_V8SF_PTEST }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_vtestps256, "__builtin_ia32_vtestnzcps256", IX86_BUILTIN_VTESTNZCPS256, GTU, (int) INT_FTYPE_V8SF_V8SF_PTEST }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestz256", IX86_BUILTIN_PTESTZ256, EQ, (int) INT_FTYPE_V4DI_V4DI_PTEST }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestc256", IX86_BUILTIN_PTESTC256, LTU, (int) INT_FTYPE_V4DI_V4DI_PTEST }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_ptest256, "__builtin_ia32_ptestnzc256", IX86_BUILTIN_PTESTNZC256, GTU, (int) INT_FTYPE_V4DI_V4DI_PTEST }, + + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movmskpd256, "__builtin_ia32_movmskpd256", IX86_BUILTIN_MOVMSKPD256, UNKNOWN, (int) INT_FTYPE_V4DF }, + { OPTION_MASK_ISA_AVX, CODE_FOR_avx_movmskps256, "__builtin_ia32_movmskps256", IX86_BUILTIN_MOVMSKPS256, UNKNOWN, (int) INT_FTYPE_V8SF }, }; /* SSE5 */ @@ -20787,6 +21507,276 @@ ix86_init_mmx_sse_builtins (void) float_type_node, NULL_TREE); + /* AVX builtins */ + tree V32QI_type_node = build_vector_type_for_mode (char_type_node, + V32QImode); + tree V8SI_type_node = build_vector_type_for_mode (intSI_type_node, + V8SImode); + tree V8SF_type_node = build_vector_type_for_mode (float_type_node, + V8SFmode); + tree V4DI_type_node = build_vector_type_for_mode (long_long_integer_type_node, + V4DImode); + tree V4DF_type_node = build_vector_type_for_mode (double_type_node, + V4DFmode); + tree v8sf_ftype_v8sf + = build_function_type_list (V8SF_type_node, + V8SF_type_node, + NULL_TREE); + tree v8si_ftype_v8sf + = build_function_type_list (V8SI_type_node, + V8SF_type_node, + NULL_TREE); + tree v8sf_ftype_v8si + = build_function_type_list (V8SF_type_node, + V8SI_type_node, + NULL_TREE); + tree v4si_ftype_v4df + = build_function_type_list (V4SI_type_node, + V4DF_type_node, + NULL_TREE); + tree v4df_ftype_v4df + = build_function_type_list (V4DF_type_node, + V4DF_type_node, + NULL_TREE); + tree v4df_ftype_v4si + = build_function_type_list (V4DF_type_node, + V4SI_type_node, + NULL_TREE); + tree v4df_ftype_v4sf + = build_function_type_list (V4DF_type_node, + V4SF_type_node, + NULL_TREE); + tree v4sf_ftype_v4df + = build_function_type_list (V4SF_type_node, + V4DF_type_node, + NULL_TREE); + tree v8sf_ftype_v8sf_v8sf + = build_function_type_list (V8SF_type_node, + V8SF_type_node, V8SF_type_node, + NULL_TREE); + tree v4df_ftype_v4df_v4df + = build_function_type_list (V4DF_type_node, + V4DF_type_node, V4DF_type_node, + NULL_TREE); + tree v8sf_ftype_v8sf_int 
+ = build_function_type_list (V8SF_type_node, + V8SF_type_node, integer_type_node, + NULL_TREE); + tree v4si_ftype_v8si_int + = build_function_type_list (V4SI_type_node, + V8SI_type_node, integer_type_node, + NULL_TREE); + tree v4df_ftype_v4df_int + = build_function_type_list (V4DF_type_node, + V4DF_type_node, integer_type_node, + NULL_TREE); + tree v4sf_ftype_v8sf_int + = build_function_type_list (V4SF_type_node, + V8SF_type_node, integer_type_node, + NULL_TREE); + tree v2df_ftype_v4df_int + = build_function_type_list (V2DF_type_node, + V4DF_type_node, integer_type_node, + NULL_TREE); + tree v8sf_ftype_v8sf_v8sf_int + = build_function_type_list (V8SF_type_node, + V8SF_type_node, V8SF_type_node, + integer_type_node, + NULL_TREE); + tree v8sf_ftype_v8sf_v8sf_v8sf + = build_function_type_list (V8SF_type_node, + V8SF_type_node, V8SF_type_node, + V8SF_type_node, + NULL_TREE); + tree v4df_ftype_v4df_v4df_v4df + = build_function_type_list (V4DF_type_node, + V4DF_type_node, V4DF_type_node, + V4DF_type_node, + NULL_TREE); + tree v8si_ftype_v8si_v8si_int + = build_function_type_list (V8SI_type_node, + V8SI_type_node, V8SI_type_node, + integer_type_node, + NULL_TREE); + tree v4df_ftype_v4df_v4df_int + = build_function_type_list (V4DF_type_node, + V4DF_type_node, V4DF_type_node, + integer_type_node, + NULL_TREE); + tree v8sf_ftype_v8sf_v8sf_v8si_int + = build_function_type_list (V8SF_type_node, + V8SF_type_node, V8SF_type_node, + V8SI_type_node, integer_type_node, + NULL_TREE); + tree v4df_ftype_v4df_v4df_v4di_int + = build_function_type_list (V4DF_type_node, + V4DF_type_node, V4DF_type_node, + V4DI_type_node, integer_type_node, + NULL_TREE); + tree v4sf_ftype_v4sf_v4sf_v4si_int + = build_function_type_list (V4SF_type_node, + V4SF_type_node, V4SF_type_node, + V4SI_type_node, integer_type_node, + NULL_TREE); + tree v2df_ftype_v2df_v2df_v2di_int + = build_function_type_list (V2DF_type_node, + V2DF_type_node, V2DF_type_node, + V2DI_type_node, integer_type_node, + NULL_TREE); + tree v8sf_ftype_pcfloat + = build_function_type_list (V8SF_type_node, + pcfloat_type_node, + NULL_TREE); + tree v4df_ftype_pcdouble + = build_function_type_list (V4DF_type_node, + pcdouble_type_node, + NULL_TREE); + tree pcv4sf_type_node + = build_pointer_type (build_type_variant (V4SF_type_node, 1, 0)); + tree pcv2df_type_node + = build_pointer_type (build_type_variant (V2DF_type_node, 1, 0)); + tree v8sf_ftype_pcv4sf + = build_function_type_list (V8SF_type_node, + pcv4sf_type_node, + NULL_TREE); + tree v4df_ftype_pcv2df + = build_function_type_list (V4DF_type_node, + pcv2df_type_node, + NULL_TREE); + tree v32qi_ftype_pcchar + = build_function_type_list (V32QI_type_node, + pcchar_type_node, + NULL_TREE); + tree void_ftype_pchar_v32qi + = build_function_type_list (void_type_node, + pchar_type_node, V32QI_type_node, + NULL_TREE); + tree v8si_ftype_v8si_v4si_int + = build_function_type_list (V8SI_type_node, + V8SI_type_node, V4SI_type_node, + integer_type_node, + NULL_TREE); + tree v8sf_ftype_v8sf_v4sf_int + = build_function_type_list (V8SF_type_node, + V8SF_type_node, V4SF_type_node, + integer_type_node, + NULL_TREE); + tree v4df_ftype_v4df_v2df_int + = build_function_type_list (V4DF_type_node, + V4DF_type_node, V2DF_type_node, + integer_type_node, + NULL_TREE); + tree void_ftype_pfloat_v8sf + = build_function_type_list (void_type_node, + pfloat_type_node, V8SF_type_node, + NULL_TREE); + tree void_ftype_pdouble_v4df + = build_function_type_list (void_type_node, + pdouble_type_node, V4DF_type_node, + NULL_TREE); + tree 
pv8sf_type_node = build_pointer_type (V8SF_type_node); + tree pv4sf_type_node = build_pointer_type (V4SF_type_node); + tree pv4df_type_node = build_pointer_type (V4DF_type_node); + tree pv2df_type_node = build_pointer_type (V2DF_type_node); + tree pcv8sf_type_node + = build_pointer_type (build_type_variant (V8SF_type_node, 1, 0)); + tree pcv4df_type_node + = build_pointer_type (build_type_variant (V4DF_type_node, 1, 0)); + tree v8sf_ftype_pcv8sf_v8sf + = build_function_type_list (V8SF_type_node, + pcv8sf_type_node, V8SF_type_node, + NULL_TREE); + tree v4df_ftype_pcv4df_v4df + = build_function_type_list (V4DF_type_node, + pcv4df_type_node, V4DF_type_node, + NULL_TREE); + tree v4sf_ftype_pcv4sf_v4sf + = build_function_type_list (V4SF_type_node, + pcv4sf_type_node, V4SF_type_node, + NULL_TREE); + tree v2df_ftype_pcv2df_v2df + = build_function_type_list (V2DF_type_node, + pcv2df_type_node, V2DF_type_node, + NULL_TREE); + tree void_ftype_pv8sf_v8sf_v8sf + = build_function_type_list (void_type_node, + pv8sf_type_node, V8SF_type_node, + V8SF_type_node, + NULL_TREE); + tree void_ftype_pv4df_v4df_v4df + = build_function_type_list (void_type_node, + pv4df_type_node, V4DF_type_node, + V4DF_type_node, + NULL_TREE); + tree void_ftype_pv4sf_v4sf_v4sf + = build_function_type_list (void_type_node, + pv4sf_type_node, V4SF_type_node, + V4SF_type_node, + NULL_TREE); + tree void_ftype_pv2df_v2df_v2df + = build_function_type_list (void_type_node, + pv2df_type_node, V2DF_type_node, + V2DF_type_node, + NULL_TREE); + tree v4df_ftype_v2df + = build_function_type_list (V4DF_type_node, + V2DF_type_node, + NULL_TREE); + tree v8sf_ftype_v4sf + = build_function_type_list (V8SF_type_node, + V4SF_type_node, + NULL_TREE); + tree v8si_ftype_v4si + = build_function_type_list (V8SI_type_node, + V4SI_type_node, + NULL_TREE); + tree v2df_ftype_v4df + = build_function_type_list (V2DF_type_node, + V4DF_type_node, + NULL_TREE); + tree v4sf_ftype_v8sf + = build_function_type_list (V4SF_type_node, + V8SF_type_node, + NULL_TREE); + tree v4si_ftype_v8si + = build_function_type_list (V4SI_type_node, + V8SI_type_node, + NULL_TREE); + tree int_ftype_v4df + = build_function_type_list (integer_type_node, + V4DF_type_node, + NULL_TREE); + tree int_ftype_v8sf + = build_function_type_list (integer_type_node, + V8SF_type_node, + NULL_TREE); + tree int_ftype_v8sf_v8sf + = build_function_type_list (integer_type_node, + V8SF_type_node, V8SF_type_node, + NULL_TREE); + tree int_ftype_v4di_v4di + = build_function_type_list (integer_type_node, + V4DI_type_node, V4DI_type_node, + NULL_TREE); + tree int_ftype_v4df_v4df + = build_function_type_list (integer_type_node, + V4DF_type_node, V4DF_type_node, + NULL_TREE); + tree v8sf_ftype_v8sf_v8si + = build_function_type_list (V8SF_type_node, + V8SF_type_node, V8SI_type_node, + NULL_TREE); + tree v4df_ftype_v4df_v4di + = build_function_type_list (V4DF_type_node, + V4DF_type_node, V4DI_type_node, + NULL_TREE); + tree v4sf_ftype_v4sf_v4si + = build_function_type_list (V4SF_type_node, + V4SF_type_node, V4SI_type_node, NULL_TREE); + tree v2df_ftype_v2df_v2di + = build_function_type_list (V2DF_type_node, + V2DF_type_node, V2DI_type_node, NULL_TREE); + tree ftype; /* Add all special builtins with variable number of operands. 
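Each enumerator handled in the switch that follows encodes a builtin prototype, return type first: V8SF_FTYPE_PCV8SF_V8SF is assembled from V8SF_type_node and pcv8sf_type_node above and is the masked-load shape, while VOID_FTYPE_PV8SF_V8SF_V8SF is the masked-store shape. A rough C-level picture of the store shape with a scalar emulation of its mask convention; the function name is made up, and the sign-bit-per-element mask interpretation is how vmaskmovps is documented to behave rather than something stated in this patch:

#include <string.h>

typedef float v8sf __attribute__ ((vector_size (32)));

/* Scalar emulation of the masked-store shape VOID_FTYPE_PV8SF_V8SF_V8SF:
   an element is written only when the sign bit of the corresponding mask
   element is set.  */
static void
maskstore_ps256_emul (v8sf *dst, v8sf mask, v8sf val)
{
  float *d = (float *) dst;
  const float *m = (const float *) &mask;
  const float *v = (const float *) &val;
  int i;

  for (i = 0; i < 8; i++)
    {
      unsigned int bits;
      memcpy (&bits, &m[i], sizeof bits);
      if (bits & 0x80000000u)
        d[i] = v[i];
    }
}

int
main (void)
{
  v8sf dst  = { 0, 0, 0, 0, 0, 0, 0, 0 };
  v8sf mask = { -1.0f, 0, -1.0f, 0, 0, 0, 0, 0 };  /* sign bit set in lanes 0, 2 */
  v8sf val  = { 1, 2, 3, 4, 5, 6, 7, 8 };

  maskstore_ps256_emul (&dst, mask, val);  /* dst becomes { 1, 0, 3, 0, ... }  */
  return 0;
}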
*/ @@ -20804,9 +21794,24 @@ ix86_init_mmx_sse_builtins (void) case VOID_FTYPE_VOID: type = void_ftype_void; break; + case V32QI_FTYPE_PCCHAR: + type = v32qi_ftype_pcchar; + break; case V16QI_FTYPE_PCCHAR: type = v16qi_ftype_pcchar; break; + case V8SF_FTYPE_PCV4SF: + type = v8sf_ftype_pcv4sf; + break; + case V8SF_FTYPE_PCFLOAT: + type = v8sf_ftype_pcfloat; + break; + case V4DF_FTYPE_PCV2DF: + type = v4df_ftype_pcv2df; + break; + case V4DF_FTYPE_PCDOUBLE: + type = v4df_ftype_pcdouble; + break; case V4SF_FTYPE_PCFLOAT: type = v4sf_ftype_pcfloat; break; @@ -20816,24 +21821,45 @@ ix86_init_mmx_sse_builtins (void) case V2DF_FTYPE_PCDOUBLE: type = v2df_ftype_pcdouble; break; + case V8SF_FTYPE_PCV8SF_V8SF: + type = v8sf_ftype_pcv8sf_v8sf; + break; + case V4DF_FTYPE_PCV4DF_V4DF: + type = v4df_ftype_pcv4df_v4df; + break; case V4SF_FTYPE_V4SF_PCV2SF: type = v4sf_ftype_v4sf_pcv2sf; break; + case V4SF_FTYPE_PCV4SF_V4SF: + type = v4sf_ftype_pcv4sf_v4sf; + break; case V2DF_FTYPE_V2DF_PCDOUBLE: type = v2df_ftype_v2df_pcdouble; break; + case V2DF_FTYPE_PCV2DF_V2DF: + type = v2df_ftype_pcv2df_v2df; + break; case VOID_FTYPE_PV2SF_V4SF: type = void_ftype_pv2sf_v4sf; break; case VOID_FTYPE_PV2DI_V2DI: type = void_ftype_pv2di_v2di; break; + case VOID_FTYPE_PCHAR_V32QI: + type = void_ftype_pchar_v32qi; + break; case VOID_FTYPE_PCHAR_V16QI: type = void_ftype_pchar_v16qi; break; + case VOID_FTYPE_PFLOAT_V8SF: + type = void_ftype_pfloat_v8sf; + break; case VOID_FTYPE_PFLOAT_V4SF: type = void_ftype_pfloat_v4sf; break; + case VOID_FTYPE_PDOUBLE_V4DF: + type = void_ftype_pdouble_v4df; + break; case VOID_FTYPE_PDOUBLE_V2DF: type = void_ftype_pdouble_v2df; break; @@ -20843,6 +21869,18 @@ ix86_init_mmx_sse_builtins (void) case VOID_FTYPE_PINT_INT: type = void_ftype_pint_int; break; + case VOID_FTYPE_PV8SF_V8SF_V8SF: + type = void_ftype_pv8sf_v8sf_v8sf; + break; + case VOID_FTYPE_PV4DF_V4DF_V4DF: + type = void_ftype_pv4df_v4df_v4df; + break; + case VOID_FTYPE_PV4SF_V4SF_V4SF: + type = void_ftype_pv4sf_v4sf_v4sf; + break; + case VOID_FTYPE_PV2DF_V2DF_V2DF: + type = void_ftype_pv2df_v2df_v2df; + break; default: gcc_unreachable (); } @@ -20865,9 +21903,24 @@ ix86_init_mmx_sse_builtins (void) case FLOAT_FTYPE_FLOAT: type = float_ftype_float; break; + case INT_FTYPE_V8SF_V8SF_PTEST: + type = int_ftype_v8sf_v8sf; + break; + case INT_FTYPE_V4DI_V4DI_PTEST: + type = int_ftype_v4di_v4di; + break; + case INT_FTYPE_V4DF_V4DF_PTEST: + type = int_ftype_v4df_v4df; + break; + case INT_FTYPE_V4SF_V4SF_PTEST: + type = int_ftype_v4sf_v4sf; + break; case INT_FTYPE_V2DI_V2DI_PTEST: type = int_ftype_v2di_v2di; break; + case INT_FTYPE_V2DF_V2DF_PTEST: + type = int_ftype_v2df_v2df; + break; case INT64_FTYPE_V4SF: type = int64_ftype_v4sf; break; @@ -20880,6 +21933,12 @@ ix86_init_mmx_sse_builtins (void) case INT_FTYPE_V8QI: type = int_ftype_v8qi; break; + case INT_FTYPE_V8SF: + type = int_ftype_v8sf; + break; + case INT_FTYPE_V4DF: + type = int_ftype_v4df; + break; case INT_FTYPE_V4SF: type = int_ftype_v4sf; break; @@ -20889,6 +21948,12 @@ ix86_init_mmx_sse_builtins (void) case V16QI_FTYPE_V16QI: type = v16qi_ftype_v16qi; break; + case V8SI_FTYPE_V8SF: + type = v8si_ftype_v8sf; + break; + case V8SI_FTYPE_V4SI: + type = v8si_ftype_v4si; + break; case V8HI_FTYPE_V8HI: type = v8hi_ftype_v8hi; break; @@ -20898,12 +21963,27 @@ ix86_init_mmx_sse_builtins (void) case V8QI_FTYPE_V8QI: type = v8qi_ftype_v8qi; break; + case V8SF_FTYPE_V8SF: + type = v8sf_ftype_v8sf; + break; + case V8SF_FTYPE_V8SI: + type = v8sf_ftype_v8si; + break; + case 
V8SF_FTYPE_V4SF: + type = v8sf_ftype_v4sf; + break; + case V4SI_FTYPE_V4DF: + type = v4si_ftype_v4df; + break; case V4SI_FTYPE_V4SI: type = v4si_ftype_v4si; break; case V4SI_FTYPE_V16QI: type = v4si_ftype_v16qi; break; + case V4SI_FTYPE_V8SI: + type = v4si_ftype_v8si; + break; case V4SI_FTYPE_V8HI: type = v4si_ftype_v8hi; break; @@ -20916,13 +21996,31 @@ ix86_init_mmx_sse_builtins (void) case V4HI_FTYPE_V4HI: type = v4hi_ftype_v4hi; break; + case V4DF_FTYPE_V4DF: + type = v4df_ftype_v4df; + break; + case V4DF_FTYPE_V4SI: + type = v4df_ftype_v4si; + break; + case V4DF_FTYPE_V4SF: + type = v4df_ftype_v4sf; + break; + case V4DF_FTYPE_V2DF: + type = v4df_ftype_v2df; + break; case V4SF_FTYPE_V4SF: case V4SF_FTYPE_V4SF_VEC_MERGE: type = v4sf_ftype_v4sf; break; + case V4SF_FTYPE_V8SF: + type = v4sf_ftype_v8sf; + break; case V4SF_FTYPE_V4SI: type = v4sf_ftype_v4si; break; + case V4SF_FTYPE_V4DF: + type = v4sf_ftype_v4df; + break; case V4SF_FTYPE_V2DF: type = v4sf_ftype_v2df; break; @@ -20950,6 +22048,9 @@ ix86_init_mmx_sse_builtins (void) case V2SI_FTYPE_V2SF: type = v2si_ftype_v2sf; break; + case V2DF_FTYPE_V4DF: + type = v2df_ftype_v4df; + break; case V2DF_FTYPE_V4SF: type = v2df_ftype_v4sf; break; @@ -20994,6 +22095,12 @@ ix86_init_mmx_sse_builtins (void) case V8HI_FTYPE_V8HI_SI_COUNT: type = v8hi_ftype_v8hi_int; break; + case V8SF_FTYPE_V8SF_V8SF: + type = v8sf_ftype_v8sf_v8sf; + break; + case V8SF_FTYPE_V8SF_V8SI: + type = v8sf_ftype_v8sf_v8si; + break; case V4SI_FTYPE_V4SI_V4SI: case V4SI_FTYPE_V4SI_V4SI_COUNT: type = v4si_ftype_v4si_v4si; @@ -21023,10 +22130,19 @@ ix86_init_mmx_sse_builtins (void) case V4HI_FTYPE_V4HI_SI_COUNT: type = v4hi_ftype_v4hi_int; break; + case V4DF_FTYPE_V4DF_V4DF: + type = v4df_ftype_v4df_v4df; + break; + case V4DF_FTYPE_V4DF_V4DI: + type = v4df_ftype_v4df_v4di; + break; case V4SF_FTYPE_V4SF_V4SF: case V4SF_FTYPE_V4SF_V4SF_SWAP: type = v4sf_ftype_v4sf_v4sf; break; + case V4SF_FTYPE_V4SF_V4SI: + type = v4sf_ftype_v4sf_v4si; + break; case V4SF_FTYPE_V4SF_V2SI: type = v4sf_ftype_v4sf_v2si; break; @@ -21078,6 +22194,9 @@ ix86_init_mmx_sse_builtins (void) case V2DF_FTYPE_V2DF_V4SF: type = v2df_ftype_v2df_v4sf; break; + case V2DF_FTYPE_V2DF_V2DI: + type = v2df_ftype_v2df_v2di; + break; case V2DF_FTYPE_V2DF_DI: type = v2df_ftype_v2df_int64; break; @@ -21115,15 +22234,27 @@ ix86_init_mmx_sse_builtins (void) case V8HI_FTYPE_V8HI_INT: type = v8hi_ftype_v8hi_int; break; + case V8SF_FTYPE_V8SF_INT: + type = v8sf_ftype_v8sf_int; + break; case V4SI_FTYPE_V4SI_INT: type = v4si_ftype_v4si_int; break; + case V4SI_FTYPE_V8SI_INT: + type = v4si_ftype_v8si_int; + break; case V4HI_FTYPE_V4HI_INT: type = v4hi_ftype_v4hi_int; break; + case V4DF_FTYPE_V4DF_INT: + type = v4df_ftype_v4df_int; + break; case V4SF_FTYPE_V4SF_INT: type = v4sf_ftype_v4sf_int; break; + case V4SF_FTYPE_V8SF_INT: + type = v4sf_ftype_v8sf_int; + break; case V2DI_FTYPE_V2DI_INT: case V2DI2TI_FTYPE_V2DI_INT: type = v2di_ftype_v2di_int; @@ -21131,9 +22262,18 @@ ix86_init_mmx_sse_builtins (void) case V2DF_FTYPE_V2DF_INT: type = v2df_ftype_v2df_int; break; + case V2DF_FTYPE_V4DF_INT: + type = v2df_ftype_v4df_int; + break; case V16QI_FTYPE_V16QI_V16QI_V16QI: type = v16qi_ftype_v16qi_v16qi_v16qi; break; + case V8SF_FTYPE_V8SF_V8SF_V8SF: + type = v8sf_ftype_v8sf_v8sf_v8sf; + break; + case V4DF_FTYPE_V4DF_V4DF_V4DF: + type = v4df_ftype_v4df_v4df_v4df; + break; case V4SF_FTYPE_V4SF_V4SF_V4SF: type = v4sf_ftype_v4sf_v4sf_v4sf; break; @@ -21143,12 +22283,30 @@ ix86_init_mmx_sse_builtins (void) case V16QI_FTYPE_V16QI_V16QI_INT: 
type = v16qi_ftype_v16qi_v16qi_int; break; + case V8SI_FTYPE_V8SI_V8SI_INT: + type = v8si_ftype_v8si_v8si_int; + break; + case V8SI_FTYPE_V8SI_V4SI_INT: + type = v8si_ftype_v8si_v4si_int; + break; case V8HI_FTYPE_V8HI_V8HI_INT: type = v8hi_ftype_v8hi_v8hi_int; break; + case V8SF_FTYPE_V8SF_V8SF_INT: + type = v8sf_ftype_v8sf_v8sf_int; + break; + case V8SF_FTYPE_V8SF_V4SF_INT: + type = v8sf_ftype_v8sf_v4sf_int; + break; case V4SI_FTYPE_V4SI_V4SI_INT: type = v4si_ftype_v4si_v4si_int; break; + case V4DF_FTYPE_V4DF_V4DF_INT: + type = v4df_ftype_v4df_v4df_int; + break; + case V4DF_FTYPE_V4DF_V2DF_INT: + type = v4df_ftype_v4df_v2df_int; + break; case V4SF_FTYPE_V4SF_V4SF_INT: type = v4sf_ftype_v4sf_v4sf_int; break; @@ -21168,6 +22326,18 @@ ix86_init_mmx_sse_builtins (void) case V1DI2DI_FTYPE_V1DI_V1DI_INT: type = v1di_ftype_v1di_v1di_int; break; + case V8SF_FTYPE_V8SF_V8SF_V8SI_INT: + type = v8sf_ftype_v8sf_v8sf_v8si_int; + break; + case V4DF_FTYPE_V4DF_V4DF_V4DI_INT: + type = v4df_ftype_v4df_v4df_v4di_int; + break; + case V4SF_FTYPE_V4SF_V4SF_V4SI_INT: + type = v4sf_ftype_v4sf_v4sf_v4si_int; + break; + case V2DF_FTYPE_V2DF_V2DF_V2DI_INT: + type = v2df_ftype_v2df_v2df_v2di_int; + break; default: gcc_unreachable (); } @@ -21234,6 +22404,10 @@ ix86_init_mmx_sse_builtins (void) /* PCLMUL */ def_builtin_const (OPTION_MASK_ISA_PCLMUL, "__builtin_ia32_pclmulqdq128", v2di_ftype_v2di_v2di_int, IX86_BUILTIN_PCLMULQDQ128); + /* AVX */ + def_builtin (OPTION_MASK_ISA_AVX, "__builtin_ia32_vzeroupper", void_ftype_void, + TARGET_64BIT ? IX86_BUILTIN_VZEROUPPER_REX64 : IX86_BUILTIN_VZEROUPPER); + /* Access to the vec_init patterns. */ ftype = build_function_type_list (V2SI_type_node, integer_type_node, integer_type_node, NULL_TREE); @@ -22134,7 +23308,12 @@ ix86_expand_args_builtin (const struct builtin_description *d, switch ((enum ix86_builtin_type) d->flag) { + case INT_FTYPE_V8SF_V8SF_PTEST: + case INT_FTYPE_V4DI_V4DI_PTEST: + case INT_FTYPE_V4DF_V4DF_PTEST: + case INT_FTYPE_V4SF_V4SF_PTEST: case INT_FTYPE_V2DI_V2DI_PTEST: + case INT_FTYPE_V2DF_V2DF_PTEST: return ix86_expand_sse_ptest (d, exp, target); case FLOAT128_FTYPE_FLOAT128: case FLOAT_FTYPE_FLOAT: @@ -22142,20 +23321,35 @@ ix86_expand_args_builtin (const struct builtin_description *d, case INT64_FTYPE_V2DF: case INT_FTYPE_V16QI: case INT_FTYPE_V8QI: + case INT_FTYPE_V8SF: + case INT_FTYPE_V4DF: case INT_FTYPE_V4SF: case INT_FTYPE_V2DF: case V16QI_FTYPE_V16QI: + case V8SI_FTYPE_V8SF: + case V8SI_FTYPE_V4SI: case V8HI_FTYPE_V8HI: case V8HI_FTYPE_V16QI: case V8QI_FTYPE_V8QI: + case V8SF_FTYPE_V8SF: + case V8SF_FTYPE_V8SI: + case V8SF_FTYPE_V4SF: case V4SI_FTYPE_V4SI: case V4SI_FTYPE_V16QI: case V4SI_FTYPE_V4SF: + case V4SI_FTYPE_V8SI: case V4SI_FTYPE_V8HI: + case V4SI_FTYPE_V4DF: case V4SI_FTYPE_V2DF: case V4HI_FTYPE_V4HI: + case V4DF_FTYPE_V4DF: + case V4DF_FTYPE_V4SI: + case V4DF_FTYPE_V4SF: + case V4DF_FTYPE_V2DF: case V4SF_FTYPE_V4SF: case V4SF_FTYPE_V4SI: + case V4SF_FTYPE_V8SF: + case V4SF_FTYPE_V4DF: case V4SF_FTYPE_V2DF: case V2DI_FTYPE_V2DI: case V2DI_FTYPE_V16QI: @@ -22163,6 +23357,7 @@ ix86_expand_args_builtin (const struct builtin_description *d, case V2DI_FTYPE_V4SI: case V2DF_FTYPE_V2DF: case V2DF_FTYPE_V4SI: + case V2DF_FTYPE_V4DF: case V2DF_FTYPE_V4SF: case V2DF_FTYPE_V2SI: case V2SI_FTYPE_V2SI: @@ -22184,6 +23379,8 @@ ix86_expand_args_builtin (const struct builtin_description *d, case V8HI_FTYPE_V8HI_V8HI: case V8HI_FTYPE_V16QI_V16QI: case V8HI_FTYPE_V4SI_V4SI: + case V8SF_FTYPE_V8SF_V8SF: + case V8SF_FTYPE_V8SF_V8SI: case 
V4SI_FTYPE_V4SI_V4SI: case V4SI_FTYPE_V8HI_V8HI: case V4SI_FTYPE_V4SF_V4SF: @@ -22191,7 +23388,10 @@ ix86_expand_args_builtin (const struct builtin_description *d, case V4HI_FTYPE_V4HI_V4HI: case V4HI_FTYPE_V8QI_V8QI: case V4HI_FTYPE_V2SI_V2SI: + case V4DF_FTYPE_V4DF_V4DF: + case V4DF_FTYPE_V4DF_V4DI: case V4SF_FTYPE_V4SF_V4SF: + case V4SF_FTYPE_V4SF_V4SI: case V4SF_FTYPE_V4SF_V2SI: case V4SF_FTYPE_V4SF_V2DF: case V4SF_FTYPE_V4SF_DI: @@ -22206,6 +23406,7 @@ ix86_expand_args_builtin (const struct builtin_description *d, case V2SI_FTYPE_V2SF_V2SF: case V2DF_FTYPE_V2DF_V2DF: case V2DF_FTYPE_V2DF_V4SF: + case V2DF_FTYPE_V2DF_V2DI: case V2DF_FTYPE_V2DF_DI: case V2DF_FTYPE_V2DF_SI: case V2SF_FTYPE_V2SF_V2SF: @@ -22249,22 +23450,35 @@ ix86_expand_args_builtin (const struct builtin_description *d, nargs_constant = 1; break; case V8HI_FTYPE_V8HI_INT: + case V8SF_FTYPE_V8SF_INT: case V4SI_FTYPE_V4SI_INT: + case V4SI_FTYPE_V8SI_INT: case V4HI_FTYPE_V4HI_INT: + case V4DF_FTYPE_V4DF_INT: case V4SF_FTYPE_V4SF_INT: + case V4SF_FTYPE_V8SF_INT: case V2DI_FTYPE_V2DI_INT: case V2DF_FTYPE_V2DF_INT: + case V2DF_FTYPE_V4DF_INT: nargs = 2; nargs_constant = 1; break; case V16QI_FTYPE_V16QI_V16QI_V16QI: + case V8SF_FTYPE_V8SF_V8SF_V8SF: + case V4DF_FTYPE_V4DF_V4DF_V4DF: case V4SF_FTYPE_V4SF_V4SF_V4SF: case V2DF_FTYPE_V2DF_V2DF_V2DF: nargs = 3; break; case V16QI_FTYPE_V16QI_V16QI_INT: case V8HI_FTYPE_V8HI_V8HI_INT: + case V8SI_FTYPE_V8SI_V8SI_INT: + case V8SI_FTYPE_V8SI_V4SI_INT: + case V8SF_FTYPE_V8SF_V8SF_INT: + case V8SF_FTYPE_V8SF_V4SF_INT: case V4SI_FTYPE_V4SI_V4SI_INT: + case V4DF_FTYPE_V4DF_V4DF_INT: + case V4DF_FTYPE_V4DF_V2DF_INT: case V4SF_FTYPE_V4SF_V4SF_INT: case V2DI_FTYPE_V2DI_V2DI_INT: case V2DF_FTYPE_V2DF_V2DF_INT: @@ -22285,6 +23499,13 @@ ix86_expand_args_builtin (const struct builtin_description *d, nargs = 3; nargs_constant = 2; break; + case V8SF_FTYPE_V8SF_V8SF_V8SI_INT: + case V4DF_FTYPE_V4DF_V4DF_V4DI_INT: + case V4SF_FTYPE_V4SF_V4SF_V4SI_INT: + case V2DF_FTYPE_V2DF_V2DF_V2DI_INT: + nargs = 4; + nargs_constant = 1; + break; case V2DI_FTYPE_V2DI_V2DI_UINT_UINT: nargs = 4; nargs_constant = 2; @@ -22345,13 +23566,40 @@ ix86_expand_args_builtin (const struct builtin_description *d, case CODE_FOR_sse4_1_roundsd: case CODE_FOR_sse4_1_roundss: case CODE_FOR_sse4_1_blendps: + case CODE_FOR_avx_blendpd256: + case CODE_FOR_avx_vpermilv4df: + case CODE_FOR_avx_roundpd256: + case CODE_FOR_avx_roundps256: error ("the last argument must be a 4-bit immediate"); return const0_rtx; case CODE_FOR_sse4_1_blendpd: + case CODE_FOR_avx_vpermilv2df: + case CODE_FOR_avx_vpermil2v2df3: + case CODE_FOR_avx_vpermil2v4sf3: + case CODE_FOR_avx_vpermil2v4df3: + case CODE_FOR_avx_vpermil2v8sf3: error ("the last argument must be a 2-bit immediate"); return const0_rtx; + case CODE_FOR_avx_vextractf128v4df: + case CODE_FOR_avx_vextractf128v8sf: + case CODE_FOR_avx_vextractf128v8si: + case CODE_FOR_avx_vinsertf128v4df: + case CODE_FOR_avx_vinsertf128v8sf: + case CODE_FOR_avx_vinsertf128v8si: + error ("the last argument must be a 1-bit immediate"); + return const0_rtx; + + case CODE_FOR_avx_cmpsdv2df3: + case CODE_FOR_avx_cmpssv4sf3: + case CODE_FOR_avx_cmppdv2df3: + case CODE_FOR_avx_cmppsv4sf3: + case CODE_FOR_avx_cmppdv4df3: + case CODE_FOR_avx_cmppsv8sf3: + error ("the last argument must be a 5-bit immediate"); + return const0_rtx; + default: switch (nargs_constant) { @@ -22450,8 +23698,13 @@ ix86_expand_special_args_builtin (const struct builtin_description *d, emit_insn (GEN_FCN (icode) (target)); return 0; case 
V2DI_FTYPE_PV2DI: + case V32QI_FTYPE_PCCHAR: case V16QI_FTYPE_PCCHAR: + case V8SF_FTYPE_PCV4SF: + case V8SF_FTYPE_PCFLOAT: case V4SF_FTYPE_PCFLOAT: + case V4DF_FTYPE_PCV2DF: + case V4DF_FTYPE_PCDOUBLE: case V2DF_FTYPE_PCDOUBLE: nargs = 1; klass = load; @@ -22459,8 +23712,11 @@ ix86_expand_special_args_builtin (const struct builtin_description *d, break; case VOID_FTYPE_PV2SF_V4SF: case VOID_FTYPE_PV2DI_V2DI: + case VOID_FTYPE_PCHAR_V32QI: case VOID_FTYPE_PCHAR_V16QI: + case VOID_FTYPE_PFLOAT_V8SF: case VOID_FTYPE_PFLOAT_V4SF: + case VOID_FTYPE_PDOUBLE_V4DF: case VOID_FTYPE_PDOUBLE_V2DF: case VOID_FTYPE_PDI_DI: case VOID_FTYPE_PINT_INT: @@ -22475,6 +23731,23 @@ ix86_expand_special_args_builtin (const struct builtin_description *d, klass = load; memory = 1; break; + case V8SF_FTYPE_PCV8SF_V8SF: + case V4DF_FTYPE_PCV4DF_V4DF: + case V4SF_FTYPE_PCV4SF_V4SF: + case V2DF_FTYPE_PCV2DF_V2DF: + nargs = 2; + klass = load; + memory = 0; + break; + case VOID_FTYPE_PV8SF_V8SF_V8SF: + case VOID_FTYPE_PV4DF_V4DF_V4DF: + case VOID_FTYPE_PV4SF_V4SF_V4SF: + case VOID_FTYPE_PV2DF_V2DF_V2DF: + nargs = 2; + klass = store; + /* Reserve memory operand for target. */ + memory = ARRAY_SIZE (args); + break; default: gcc_unreachable (); } @@ -23761,8 +25034,11 @@ ix86_hard_regno_mode_ok (int regno, enum machine_mode mode) { /* We implement the move patterns for all vector modes into and out of SSE registers, even when no operation instructions - are available. */ - return (VALID_SSE_REG_MODE (mode) + are available. OImode move is available only when AVX is + enabled. */ + return ((TARGET_AVX && mode == OImode) + || VALID_AVX256_REG_MODE (mode) + || VALID_SSE_REG_MODE (mode) || VALID_SSE2_REG_MODE (mode) || VALID_MMX_REG_MODE (mode) || VALID_MMX_REG_MODE_3DNOW (mode)); @@ -24911,7 +26187,8 @@ extended_reg_mentioned_1 (rtx *p, void *data ATTRIBUTE_UNUSED) bool x86_extended_reg_mentioned_p (rtx insn) { - return for_each_rtx (&PATTERN (insn), extended_reg_mentioned_1, NULL); + return for_each_rtx (INSN_P (insn) ? &PATTERN (insn) : &insn, + extended_reg_mentioned_1, NULL); } /* Generate an unsigned DImode/SImode to FP conversion. This is the same code @@ -24962,7 +26239,7 @@ static bool ix86_expand_vector_init_duplicate (bool mmx_ok, enum machine_mode mode, rtx target, rtx val) { - enum machine_mode smode, wsmode, wvmode; + enum machine_mode hmode, smode, wsmode, wvmode; rtx x; switch (mode) @@ -25087,6 +26364,33 @@ ix86_expand_vector_init_duplicate (bool mmx_ok, enum machine_mode mode, emit_move_insn (target, gen_lowpart (mode, x)); return true; + case V4DFmode: + hmode = V2DFmode; + goto half; + case V4DImode: + hmode = V2DImode; + goto half; + case V8SFmode: + hmode = V4SFmode; + goto half; + case V8SImode: + hmode = V4SImode; + goto half; + case V16HImode: + hmode = V8HImode; + goto half; + case V32QImode: + hmode = V16QImode; + goto half; +half: + { + rtx tmp = gen_reg_rtx (hmode); + ix86_expand_vector_init_duplicate (mmx_ok, hmode, tmp, val); + emit_insn (gen_rtx_SET (VOIDmode, target, + gen_rtx_VEC_CONCAT (mode, tmp, tmp))); + } + return true; + default: return false; } @@ -25127,6 +26431,14 @@ ix86_expand_vector_init_one_nonzero (bool mmx_ok, enum machine_mode mode, case V4HImode: use_vector_set = TARGET_SSE || TARGET_3DNOW_A; break; + case V32QImode: + case V16HImode: + case V8SImode: + case V8SFmode: + case V4DImode: + case V4DFmode: + use_vector_set = TARGET_AVX; + break; default: break; } @@ -25265,6 +26577,12 @@ ix86_expand_vector_init_one_var (bool mmx_ok, enum machine_mode mode, the general case. 
*/ return false; + case V4DFmode: + case V4DImode: + case V8SFmode: + case V8SImode: + case V16HImode: + case V32QImode: case V4SFmode: case V4SImode: case V8HImode: @@ -25325,7 +26643,7 @@ ix86_expand_vector_init_concat (enum machine_mode mode, rtx target, rtx *ops, int n) { enum machine_mode cmode, hmode = VOIDmode; - rtx first[4], second[2]; + rtx first[8], second[4]; rtvec v; int i, j; @@ -25334,6 +26652,18 @@ ix86_expand_vector_init_concat (enum machine_mode mode, case 2: switch (mode) { + case V8SImode: + cmode = V4SImode; + break; + case V8SFmode: + cmode = V4SFmode; + break; + case V4DImode: + cmode = V2DImode; + break; + case V4DFmode: + cmode = V2DFmode; + break; case V4SImode: cmode = V2SImode; break; @@ -25368,6 +26698,12 @@ ix86_expand_vector_init_concat (enum machine_mode mode, case 4: switch (mode) { + case V4DImode: + cmode = V2DImode; + break; + case V4DFmode: + cmode = V2DFmode; + break; case V4SImode: cmode = V2SImode; break; @@ -25379,6 +26715,22 @@ ix86_expand_vector_init_concat (enum machine_mode mode, } goto half; + case 8: + switch (mode) + { + case V8SImode: + cmode = V2SImode; + hmode = V4SImode; + break; + case V8SFmode: + cmode = V2SFmode; + hmode = V4SFmode; + break; + default: + gcc_unreachable (); + } + goto half; + half: /* FIXME: We process inputs backward to help RA. PR 36222. */ i = n - 1; @@ -25531,7 +26883,8 @@ static void ix86_expand_vector_init_general (bool mmx_ok, enum machine_mode mode, rtx target, rtx vals) { - rtx ops[16]; + rtx ops[32], op0, op1; + enum machine_mode half_mode = VOIDmode; int n, i; switch (mode) @@ -25542,6 +26895,10 @@ ix86_expand_vector_init_general (bool mmx_ok, enum machine_mode mode, break; /* FALLTHRU */ + case V8SFmode: + case V8SImode: + case V4DFmode: + case V4DImode: case V4SFmode: case V4SImode: case V2DFmode: @@ -25552,6 +26909,28 @@ ix86_expand_vector_init_general (bool mmx_ok, enum machine_mode mode, ix86_expand_vector_init_concat (mode, target, ops, n); return; + case V32QImode: + half_mode = V16QImode; + goto half; + + case V16HImode: + half_mode = V8HImode; + goto half; + +half: + n = GET_MODE_NUNITS (mode); + for (i = 0; i < n; i++) + ops[i] = XVECEXP (vals, 0, i); + op0 = gen_reg_rtx (half_mode); + op1 = gen_reg_rtx (half_mode); + ix86_expand_vector_init_interleave (half_mode, op0, ops, + n >> 2); + ix86_expand_vector_init_interleave (half_mode, op1, + &ops [n >> 1], n >> 2); + emit_insn (gen_rtx_SET (VOIDmode, target, + gen_rtx_VEC_CONCAT (mode, op0, op1))); + return; + case V16QImode: if (!TARGET_SSE4_1) break; @@ -25694,8 +27073,28 @@ ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt) { enum machine_mode mode = GET_MODE (target); enum machine_mode inner_mode = GET_MODE_INNER (mode); + enum machine_mode half_mode; bool use_vec_merge = false; rtx tmp; + static rtx (*gen_extract[6][2]) (rtx, rtx) + = { + { gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi }, + { gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi }, + { gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si }, + { gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di }, + { gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf }, + { gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df } + }; + static rtx (*gen_insert[6][2]) (rtx, rtx, rtx) + = { + { gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi }, + { gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi }, + { gen_vec_set_lo_v8si, gen_vec_set_hi_v8si }, + { gen_vec_set_lo_v4di, gen_vec_set_hi_v4di }, + { gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf }, + { gen_vec_set_lo_v4df, gen_vec_set_hi_v4df } + }; + int i, j, 
n; switch (mode) { @@ -25843,6 +27242,62 @@ ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt) break; case V8QImode: + break; + + case V32QImode: + half_mode = V16QImode; + j = 0; + n = 16; + goto half; + + case V16HImode: + half_mode = V8HImode; + j = 1; + n = 8; + goto half; + + case V8SImode: + half_mode = V4SImode; + j = 2; + n = 4; + goto half; + + case V4DImode: + half_mode = V2DImode; + j = 3; + n = 2; + goto half; + + case V8SFmode: + half_mode = V4SFmode; + j = 4; + n = 4; + goto half; + + case V4DFmode: + half_mode = V2DFmode; + j = 5; + n = 2; + goto half; + +half: + /* Compute offset. */ + i = elt / n; + elt %= n; + + gcc_assert (i <= 1); + + /* Extract the half. */ + tmp = gen_reg_rtx (half_mode); + emit_insn ((*gen_extract[j][i]) (tmp, target)); + + /* Put val in tmp at elt. */ + ix86_expand_vector_set (false, tmp, val, elt); + + /* Put it back. */ + emit_insn ((*gen_insert[j][i]) (target, target, tmp)); + return; + default: break; } @@ -26044,6 +27499,8 @@ ix86_vector_mode_supported_p (enum machine_mode mode) return true; if (TARGET_SSE2 && VALID_SSE2_REG_MODE (mode)) return true; + if (TARGET_AVX && VALID_AVX256_REG_MODE (mode)) + return true; if (TARGET_MMX && VALID_MMX_REG_MODE (mode)) return true; if (TARGET_3DNOW && VALID_MMX_REG_MODE_3DNOW (mode)) diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h index 69c7472..7ad7069 100644 --- a/gcc/config/i386/i386.h +++ b/gcc/config/i386/i386.h @@ -46,6 +46,8 @@ along with GCC; see the file COPYING3. If not see #define TARGET_SSSE3 OPTION_ISA_SSSE3 #define TARGET_SSE4_1 OPTION_ISA_SSE4_1 #define TARGET_SSE4_2 OPTION_ISA_SSE4_2 +#define TARGET_AVX OPTION_ISA_AVX +#define TARGET_FMA OPTION_ISA_FMA #define TARGET_SSE4A OPTION_ISA_SSE4A #define TARGET_SSE5 OPTION_ISA_SSE5 #define TARGET_ROUND OPTION_ISA_ROUND @@ -702,7 +704,7 @@ enum target_cpu_default Pentium+ prefers DFmode values to be aligned to 64 bit boundary and Pentium Pro XFmode values at 128 bit boundaries. */ -#define BIGGEST_ALIGNMENT 128 +#define BIGGEST_ALIGNMENT (TARGET_AVX ? 256: 128) /* Maximum stack alignment. */ #define MAX_STACK_ALIGNMENT MAX_OFILE_ALIGNMENT @@ -996,6 +998,10 @@ do { \ #define HARD_REGNO_NREGS_WITH_PADDING(REGNO, MODE) ((MODE) == XFmode ? 4 : 8) +#define VALID_AVX256_REG_MODE(MODE) \ + ((MODE) == V32QImode || (MODE) == V16HImode || (MODE) == V8SImode \ + || (MODE) == V4DImode || (MODE) == V8SFmode || (MODE) == V4DFmode) + #define VALID_SSE2_REG_MODE(MODE) \ ((MODE) == V16QImode || (MODE) == V8HImode || (MODE) == V2DFmode \ || (MODE) == V2DImode || (MODE) == DFmode) @@ -1013,8 +1019,14 @@ do { \ || (MODE) == V4HImode || (MODE) == V8QImode) /* ??? No autovectorization into MMX or 3DNOW until we can reliably - place emms and femms instructions. */ -#define UNITS_PER_SIMD_WORD(MODE) (TARGET_SSE ? 16 : UNITS_PER_WORD) + place emms and femms instructions. + FIXME: AVX has 32byte floating point vector operations and 16byte + integer vector operations. But vectorizer doesn't support + different sizes for integer and floating point vectors. We limit + vector size to 16byte. */ +#define UNITS_PER_SIMD_WORD(MODE) \ + (TARGET_AVX ? (((MODE) == DFmode || (MODE) == SFmode) ? 16 : 16) \ + : (TARGET_SSE ? 
16 : UNITS_PER_WORD)) #define VALID_DFP_MODE_P(MODE) \ ((MODE) == SDmode || (MODE) == DDmode || (MODE) == TDmode) @@ -1035,7 +1047,9 @@ do { \ #define SSE_REG_MODE_P(MODE) \ ((MODE) == TImode || (MODE) == V16QImode || (MODE) == TFmode \ || (MODE) == V8HImode || (MODE) == V2DFmode || (MODE) == V2DImode \ - || (MODE) == V4SFmode || (MODE) == V4SImode) + || (MODE) == V4SFmode || (MODE) == V4SImode || (MODE) == V32QImode \ + || (MODE) == V16HImode || (MODE) == V8SImode || (MODE) == V4DImode \ + || (MODE) == V8SFmode || (MODE) == V4DFmode) /* Value is 1 if hard register REGNO can hold a value of machine-mode MODE. */ @@ -1339,6 +1353,19 @@ enum reg_class #define SSE_VEC_FLOAT_MODE_P(MODE) \ ((TARGET_SSE && (MODE) == V4SFmode) || (TARGET_SSE2 && (MODE) == V2DFmode)) +#define AVX_FLOAT_MODE_P(MODE) \ + (TARGET_AVX && ((MODE) == SFmode || (MODE) == DFmode)) + +#define AVX128_VEC_FLOAT_MODE_P(MODE) \ + (TARGET_AVX && ((MODE) == V4SFmode || (MODE) == V2DFmode)) + +#define AVX256_VEC_FLOAT_MODE_P(MODE) \ + (TARGET_AVX && ((MODE) == V8SFmode || (MODE) == V4DFmode)) + +#define AVX_VEC_FLOAT_MODE_P(MODE) \ + (TARGET_AVX && ((MODE) == V4SFmode || (MODE) == V2DFmode \ + || (MODE) == V8SFmode || (MODE) == V4DFmode)) + #define MMX_REG_P(XOP) (REG_P (XOP) && MMX_REGNO_P (REGNO (XOP))) #define MMX_REGNO_P(N) IN_RANGE ((N), FIRST_MMX_REG, LAST_MMX_REG) @@ -1559,6 +1586,7 @@ typedef struct ix86_args { int fastcall; /* fastcall calling convention is used */ int sse_words; /* # sse words passed so far */ int sse_nregs; /* # sse registers available for passing */ + int warn_avx; /* True when we want to warn about AVX ABI. */ int warn_sse; /* True when we want to warn about SSE ABI. */ int warn_mmx; /* True when we want to warn about MMX ABI. */ int sse_regno; /* next available sse register number */ @@ -2133,6 +2161,29 @@ do { \ #define ASM_OUTPUT_ADDR_DIFF_ELT(FILE, BODY, VALUE, REL) \ ix86_output_addr_diff_elt ((FILE), (VALUE), (REL)) +/* When we see %v, we will print the 'v' prefix if TARGET_AVX is + true. */ + +#define ASM_OUTPUT_AVX_PREFIX(STREAM, PTR) \ +{ \ + if ((PTR)[0] == '%' && (PTR)[1] == 'v') \ + { \ + if (TARGET_AVX) \ + (PTR) += 1; \ + else \ + (PTR) += 2; \ + } \ +} + +/* A C statement or statements which output an assembler instruction + opcode to the stdio stream STREAM. The macro-operand PTR is a + variable of type `char *' which points to the opcode name in + its "internal" form--the form that is written in the machine + description. */ + +#define ASM_OUTPUT_OPCODE(STREAM, PTR) \ + ASM_OUTPUT_AVX_PREFIX ((STREAM), (PTR)) + /* Under some conditions we need jump tables in the text section, because the assembler cannot handle label differences between sections. This is the case for x86_64 on Mach-O for example. 
*/ diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index 49fde6b..0a13751 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -195,6 +195,16 @@ ; For PCLMUL support (UNSPEC_PCLMUL 165) + + ; For AVX support + (UNSPEC_PCMP 166) + (UNSPEC_VPERMIL 167) + (UNSPEC_VPERMIL2 168) + (UNSPEC_VPERMIL2F128 169) + (UNSPEC_MASKLOAD 170) + (UNSPEC_MASKSTORE 171) + (UNSPEC_CAST 172) + (UNSPEC_VTESTP 173) ]) (define_constants @@ -214,6 +224,8 @@ (UNSPECV_LOCK 13) (UNSPECV_PROLOGUE_USE 14) (UNSPECV_CLD 15) + (UNSPECV_VZEROALL 16) + (UNSPECV_VZEROUPPER 17) ]) ;; Constants to represent pcomtrue/pcomfalse variants @@ -253,9 +265,25 @@ (FLAGS_REG 17) (FPSR_REG 18) (FPCR_REG 19) + (XMM0_REG 21) + (XMM1_REG 22) + (XMM2_REG 23) + (XMM3_REG 24) + (XMM4_REG 25) + (XMM5_REG 26) + (XMM6_REG 27) + (XMM7_REG 28) (R10_REG 39) (R11_REG 40) (R13_REG 42) + (XMM8_REG 45) + (XMM9_REG 46) + (XMM10_REG 47) + (XMM11_REG 48) + (XMM12_REG 49) + (XMM13_REG 50) + (XMM14_REG 51) + (XMM15_REG 52) ]) ;; Insns whose names begin with "x86_" are emitted by gen_FOO calls @@ -291,7 +319,7 @@ ;; Main data type used by the insn (define_attr "mode" - "unknown,none,QI,HI,SI,DI,TI,SF,DF,XF,TF,V4SF,V2DF,V2SF,V1DF" + "unknown,none,QI,HI,SI,DI,TI,OI,SF,DF,XF,TF,V8SF,V4DF,V4SF,V2DF,V2SF,V1DF" (const_string "unknown")) ;; The CPU unit operations uses. @@ -388,6 +416,28 @@ ;; There are also additional prefixes in SSSE3. (define_attr "prefix_extra" "" (const_int 0)) +;; Prefix used: original, VEX or maybe VEX. +(define_attr "prefix" "orig,vex,maybe_vex" + (if_then_else (eq_attr "mode" "OI,V8SF,V4DF") + (const_string "vex") + (const_string "orig"))) + +;; There is a 8bit immediate for VEX. +(define_attr "prefix_vex_imm8" "" (const_int 0)) + +;; VEX W bit is used. +(define_attr "prefix_vex_w" "" (const_int 0)) + +;; The length of VEX prefix +(define_attr "length_vex" "" + (if_then_else (eq_attr "prefix_0f" "1") + (if_then_else (eq_attr "prefix_vex_w" "1") + (symbol_ref "ix86_attr_length_vex_default (insn, 1, 1)") + (symbol_ref "ix86_attr_length_vex_default (insn, 1, 0)")) + (if_then_else (eq_attr "prefix_vex_w" "1") + (symbol_ref "ix86_attr_length_vex_default (insn, 0, 1)") + (symbol_ref "ix86_attr_length_vex_default (insn, 0, 0)")))) + ;; Set when modrm byte is used. 
(define_attr "modrm" "" (cond [(eq_attr "type" "str,leave") @@ -433,7 +483,14 @@ (eq_attr "unit" "i387") (plus (const_int 2) (plus (attr "prefix_data16") - (attr "length_address")))] + (attr "length_address"))) + (ior (eq_attr "prefix" "vex") + (and (eq_attr "prefix" "maybe_vex") + (ne (symbol_ref "TARGET_AVX") (const_int 0)))) + (plus (attr "length_vex") + (plus (attr "prefix_vex_imm8") + (plus (attr "modrm") + (attr "length_address"))))] (plus (plus (attr "modrm") (plus (attr "prefix_0f") (plus (attr "prefix_rex") @@ -572,6 +629,9 @@ ;; Mapping of unsigned max and min (define_code_iterator umaxmin [umax umin]) +;; Mapping of signed/unsigned max and min +(define_code_iterator maxmin [smax smin umax umin]) + ;; Base name for integer and FP insn mnemonic (define_code_attr maxminiprefix [(smax "maxs") (smin "mins") (umax "maxu") (umin "minu")]) @@ -1254,6 +1314,7 @@ && GET_MODE (operands[0]) == GET_MODE (operands[1])" "* return output_fp_compare (insn, operands, 1, 0);" [(set_attr "type" "fcmp,ssecomi") + (set_attr "prefix" "orig,maybe_vex") (set (attr "mode") (if_then_else (match_operand:SF 1 "" "") (const_string "SF") @@ -1270,6 +1331,7 @@ && GET_MODE (operands[0]) == GET_MODE (operands[1])" "* return output_fp_compare (insn, operands, 1, 0);" [(set_attr "type" "ssecomi") + (set_attr "prefix" "maybe_vex") (set (attr "mode") (if_then_else (match_operand:SF 1 "" "") (const_string "SF") @@ -1306,6 +1368,7 @@ && GET_MODE (operands[0]) == GET_MODE (operands[1])" "* return output_fp_compare (insn, operands, 1, 1);" [(set_attr "type" "fcmp,ssecomi") + (set_attr "prefix" "orig,maybe_vex") (set (attr "mode") (if_then_else (match_operand:SF 1 "" "") (const_string "SF") @@ -1322,6 +1385,7 @@ && GET_MODE (operands[0]) == GET_MODE (operands[1])" "* return output_fp_compare (insn, operands, 1, 1);" [(set_attr "type" "ssecomi") + (set_attr "prefix" "maybe_vex") (set (attr "mode") (if_then_else (match_operand:SF 1 "" "") (const_string "SF") @@ -1450,20 +1514,20 @@ { case TYPE_SSELOG1: if (get_attr_mode (insn) == MODE_TI) - return "pxor\t%0, %0"; - return "xorps\t%0, %0"; + return "%vpxor\t%0, %d0"; + return "%vxorps\t%0, %d0"; case TYPE_SSEMOV: switch (get_attr_mode (insn)) { case MODE_TI: - return "movdqa\t{%1, %0|%0, %1}"; + return "%vmovdqa\t{%1, %0|%0, %1}"; case MODE_V4SF: - return "movaps\t{%1, %0|%0, %1}"; + return "%vmovaps\t{%1, %0|%0, %1}"; case MODE_SI: - return "movd\t{%1, %0|%0, %1}"; + return "%vmovd\t{%1, %0|%0, %1}"; case MODE_SF: - return "movss\t{%1, %0|%0, %1}"; + return "%vmovss\t{%1, %0|%0, %1}"; default: gcc_unreachable (); } @@ -1497,6 +1561,10 @@ (const_string "lea") ] (const_string "imov"))) + (set (attr "prefix") + (if_then_else (eq_attr "alternative" "0,1,2,3,4,5") + (const_string "orig") + (const_string "maybe_vex"))) (set (attr "mode") (cond [(eq_attr "alternative" "2,3") (const_string "DI") @@ -2225,15 +2293,19 @@ pxor\t%0, %0 movq\t{%1, %0|%0, %1} movq\t{%1, %0|%0, %1} - pxor\t%0, %0 - movq\t{%1, %0|%0, %1} - movdqa\t{%1, %0|%0, %1} - movq\t{%1, %0|%0, %1} + %vpxor\t%0, %d0 + %vmovq\t{%1, %0|%0, %1} + %vmovdqa\t{%1, %0|%0, %1} + %vmovq\t{%1, %0|%0, %1} xorps\t%0, %0 movlps\t{%1, %0|%0, %1} movaps\t{%1, %0|%0, %1} movlps\t{%1, %0|%0, %1}" [(set_attr "type" "*,*,mmx,mmxmov,mmxmov,sselog1,ssemov,ssemov,ssemov,sselog1,ssemov,ssemov,ssemov") + (set (attr "prefix") + (if_then_else (eq_attr "alternative" "5,6,7,8") + (const_string "vex") + (const_string "orig"))) (set_attr "mode" "DI,DI,DI,DI,DI,TI,DI,TI,DI,V4SF,V2SF,V4SF,V2SF")]) (define_split @@ -2270,6 +2342,14 @@ return 
"movdq2q\t{%1, %0|%0, %1}"; case TYPE_SSEMOV: + if (TARGET_AVX) + { + if (get_attr_mode (insn) == MODE_TI) + return "vmovdqa\t{%1, %0|%0, %1}"; + else + return "vmovq\t{%1, %0|%0, %1}"; + } + if (get_attr_mode (insn) == MODE_TI) return "movdqa\t{%1, %0|%0, %1}"; /* FALLTHRU */ @@ -2282,6 +2362,8 @@ return "movq\t{%1, %0|%0, %1}"; case TYPE_SSELOG1: + return "%vpxor\t%0, %d0"; + case TYPE_MMXADD: return "pxor\t%0, %0"; @@ -2320,6 +2402,10 @@ (const_string "imov"))) (set_attr "modrm" "*,0,0,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*") (set_attr "length_immediate" "*,4,8,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*,*") + (set (attr "prefix") + (if_then_else (eq_attr "alternative" "11,12,13,14,15,16") + (const_string "maybe_vex") + (const_string "orig"))) (set_attr "mode" "SI,DI,DI,DI,SI,DI,DI,DI,DI,DI,DI,TI,TI,DI,DI,DI,DI,DI,DI")]) ;; Stores and loads of ax to arbitrary constant address. @@ -2402,6 +2488,37 @@ (set_attr "athlon_decode" "vector") (set_attr "amdfam10_decode" "double")]) +(define_expand "movoi" + [(set (match_operand:OI 0 "nonimmediate_operand" "") + (match_operand:OI 1 "general_operand" ""))] + "TARGET_AVX" + "ix86_expand_move (OImode, operands); DONE;") + +(define_insn "*movoi_internal" + [(set (match_operand:OI 0 "nonimmediate_operand" "=x,x,m") + (match_operand:OI 1 "vector_move_operand" "C,xm,x"))] + "TARGET_AVX + && !(MEM_P (operands[0]) && MEM_P (operands[1]))" +{ + switch (which_alternative) + { + case 0: + return "vxorps\t%0, %0, %0"; + case 1: + case 2: + if (misaligned_operand (operands[0], OImode) + || misaligned_operand (operands[1], OImode)) + return "vmovdqu\t{%1, %0|%0, %1}"; + else + return "vmovdqa\t{%1, %0|%0, %1}"; + default: + gcc_unreachable (); + } +} + [(set_attr "type" "sselog1,ssemov,ssemov") + (set_attr "prefix" "vex") + (set_attr "mode" "OI")]) + (define_expand "movti" [(set (match_operand:TI 0 "nonimmediate_operand" "") (match_operand:TI 1 "nonimmediate_operand" ""))] @@ -2426,9 +2543,9 @@ { case 0: if (get_attr_mode (insn) == MODE_V4SF) - return "xorps\t%0, %0"; + return "%vxorps\t%0, %d0"; else - return "pxor\t%0, %0"; + return "%vpxor\t%0, %d0"; case 1: case 2: /* TDmode values are passed as TImode on the stack. Moving them @@ -2437,22 +2554,23 @@ || misaligned_operand (operands[1], TImode)) { if (get_attr_mode (insn) == MODE_V4SF) - return "movups\t{%1, %0|%0, %1}"; + return "%vmovups\t{%1, %0|%0, %1}"; else - return "movdqu\t{%1, %0|%0, %1}"; + return "%vmovdqu\t{%1, %0|%0, %1}"; } else { if (get_attr_mode (insn) == MODE_V4SF) - return "movaps\t{%1, %0|%0, %1}"; + return "%vmovaps\t{%1, %0|%0, %1}"; else - return "movdqa\t{%1, %0|%0, %1}"; + return "%vmovdqa\t{%1, %0|%0, %1}"; } default: gcc_unreachable (); } } [(set_attr "type" "sselog1,ssemov,ssemov") + (set_attr "prefix" "maybe_vex") (set (attr "mode") (cond [(ior (eq (symbol_ref "TARGET_SSE2") (const_int 0)) (ne (symbol_ref "optimize_size") (const_int 0))) @@ -2476,9 +2594,9 @@ return "#"; case 2: if (get_attr_mode (insn) == MODE_V4SF) - return "xorps\t%0, %0"; + return "%vxorps\t%0, %d0"; else - return "pxor\t%0, %0"; + return "%vpxor\t%0, %d0"; case 3: case 4: /* TDmode values are passed as TImode on the stack. 
Moving them @@ -2487,22 +2605,23 @@ || misaligned_operand (operands[1], TImode)) { if (get_attr_mode (insn) == MODE_V4SF) - return "movups\t{%1, %0|%0, %1}"; + return "%vmovups\t{%1, %0|%0, %1}"; else - return "movdqu\t{%1, %0|%0, %1}"; + return "%vmovdqu\t{%1, %0|%0, %1}"; } else { if (get_attr_mode (insn) == MODE_V4SF) - return "movaps\t{%1, %0|%0, %1}"; + return "%vmovaps\t{%1, %0|%0, %1}"; else - return "movdqa\t{%1, %0|%0, %1}"; + return "%vmovdqa\t{%1, %0|%0, %1}"; } default: gcc_unreachable (); } } [(set_attr "type" "*,*,sselog1,ssemov,ssemov") + (set_attr "prefix" "*,*,maybe_vex,maybe_vex,maybe_vex") (set (attr "mode") (cond [(eq_attr "alternative" "2,3") (if_then_else @@ -2628,20 +2747,27 @@ return "mov{l}\t{%1, %0|%0, %1}"; case 5: if (get_attr_mode (insn) == MODE_TI) - return "pxor\t%0, %0"; + return "%vpxor\t%0, %d0"; else - return "xorps\t%0, %0"; + return "%vxorps\t%0, %d0"; case 6: if (get_attr_mode (insn) == MODE_V4SF) - return "movaps\t{%1, %0|%0, %1}"; + return "%vmovaps\t{%1, %0|%0, %1}"; + else + return "%vmovss\t{%1, %d0|%d0, %1}"; + case 7: + if (TARGET_AVX) + return REG_P (operands[1]) ? "vmovss\t{%1, %0, %0|%0, %0, %1}" + : "vmovss\t{%1, %0|%0, %1}"; else return "movss\t{%1, %0|%0, %1}"; - case 7: case 8: - return "movss\t{%1, %0|%0, %1}"; + case 8: + return "%vmovss\t{%1, %0|%0, %1}"; - case 9: case 10: - case 12: case 13: case 14: case 15: + case 9: case 10: case 14: case 15: return "movd\t{%1, %0|%0, %1}"; + case 12: case 13: + return "%vmovd\t{%1, %0|%0, %1}"; case 11: return "movq\t{%1, %0|%0, %1}"; @@ -2651,6 +2777,10 @@ } } [(set_attr "type" "fmov,fmov,fmov,imov,imov,sselog1,ssemov,ssemov,ssemov,mmxmov,mmxmov,mmxmov,ssemov,ssemov,mmxmov,mmxmov") + (set (attr "prefix") + (if_then_else (eq_attr "alternative" "5,6,7,8,12,13") + (const_string "maybe_vex") + (const_string "orig"))) (set (attr "mode") (cond [(eq_attr "alternative" "3,4,9,10") (const_string "SI") @@ -2790,11 +2920,11 @@ switch (get_attr_mode (insn)) { case MODE_V4SF: - return "xorps\t%0, %0"; + return "%vxorps\t%0, %d0"; case MODE_V2DF: - return "xorpd\t%0, %0"; + return "%vxorpd\t%0, %d0"; case MODE_TI: - return "pxor\t%0, %0"; + return "%vpxor\t%0, %d0"; default: gcc_unreachable (); } @@ -2804,19 +2934,43 @@ switch (get_attr_mode (insn)) { case MODE_V4SF: - return "movaps\t{%1, %0|%0, %1}"; + return "%vmovaps\t{%1, %0|%0, %1}"; case MODE_V2DF: - return "movapd\t{%1, %0|%0, %1}"; + return "%vmovapd\t{%1, %0|%0, %1}"; case MODE_TI: - return "movdqa\t{%1, %0|%0, %1}"; + return "%vmovdqa\t{%1, %0|%0, %1}"; case MODE_DI: - return "movq\t{%1, %0|%0, %1}"; + return "%vmovq\t{%1, %0|%0, %1}"; case MODE_DF: - return "movsd\t{%1, %0|%0, %1}"; + if (TARGET_AVX) + { + if (REG_P (operands[0]) && REG_P (operands[1])) + return "vmovsd\t{%1, %0, %0|%0, %0, %1}"; + else + return "vmovsd\t{%1, %0|%0, %1}"; + } + else + return "movsd\t{%1, %0|%0, %1}"; case MODE_V1DF: - return "movlpd\t{%1, %0|%0, %1}"; + if (TARGET_AVX) + { + if (REG_P (operands[0])) + return "vmovlpd\t{%1, %0, %0|%0, %0, %1}"; + else + return "vmovlpd\t{%1, %0|%0, %1}"; + } + else + return "movlpd\t{%1, %0|%0, %1}"; case MODE_V2SF: - return "movlps\t{%1, %0|%0, %1}"; + if (TARGET_AVX) + { + if (REG_P (operands[0])) + return "vmovlps\t{%1, %0, %0|%0, %0, %1}"; + else + return "vmovlps\t{%1, %0|%0, %1}"; + } + else + return "movlps\t{%1, %0|%0, %1}"; default: gcc_unreachable (); } @@ -2826,6 +2980,10 @@ } } [(set_attr "type" "fmov,fmov,fmov,multi,multi,sselog1,ssemov,ssemov,ssemov") + (set (attr "prefix") + (if_then_else (eq_attr "alternative" 
"0,1,2,3,4") + (const_string "orig") + (const_string "maybe_vex"))) (set (attr "mode") (cond [(eq_attr "alternative" "0,1,2") (const_string "DF") @@ -2907,11 +3065,11 @@ switch (get_attr_mode (insn)) { case MODE_V4SF: - return "xorps\t%0, %0"; + return "%vxorps\t%0, %d0"; case MODE_V2DF: - return "xorpd\t%0, %0"; + return "%vxorpd\t%0, %d0"; case MODE_TI: - return "pxor\t%0, %0"; + return "%vpxor\t%0, %d0"; default: gcc_unreachable (); } @@ -2921,32 +3079,44 @@ switch (get_attr_mode (insn)) { case MODE_V4SF: - return "movaps\t{%1, %0|%0, %1}"; + return "%vmovaps\t{%1, %0|%0, %1}"; case MODE_V2DF: - return "movapd\t{%1, %0|%0, %1}"; + return "%vmovapd\t{%1, %0|%0, %1}"; case MODE_TI: - return "movdqa\t{%1, %0|%0, %1}"; + return "%vmovdqa\t{%1, %0|%0, %1}"; case MODE_DI: - return "movq\t{%1, %0|%0, %1}"; + return "%vmovq\t{%1, %0|%0, %1}"; case MODE_DF: - return "movsd\t{%1, %0|%0, %1}"; + if (TARGET_AVX) + { + if (REG_P (operands[0]) && REG_P (operands[1])) + return "vmovsd\t{%1, %0, %0|%0, %0, %1}"; + else + return "vmovsd\t{%1, %0|%0, %1}"; + } + else + return "movsd\t{%1, %0|%0, %1}"; case MODE_V1DF: - return "movlpd\t{%1, %0|%0, %1}"; + return "%vmovlpd\t{%1, %d0|%d0, %1}"; case MODE_V2SF: - return "movlps\t{%1, %0|%0, %1}"; + return "%vmovlps\t{%1, %d0|%d0, %1}"; default: gcc_unreachable (); } case 9: case 10: - return "movd\t{%1, %0|%0, %1}"; + return "%vmovd\t{%1, %0|%0, %1}"; default: gcc_unreachable(); } } [(set_attr "type" "fmov,fmov,fmov,multi,multi,sselog1,ssemov,ssemov,ssemov,ssemov,ssemov") + (set (attr "prefix") + (if_then_else (eq_attr "alternative" "0,1,2,3,4") + (const_string "orig") + (const_string "maybe_vex"))) (set (attr "mode") (cond [(eq_attr "alternative" "0,1,2") (const_string "DF") @@ -3278,14 +3448,14 @@ case 0: case 1: if (get_attr_mode (insn) == MODE_V4SF) - return "movaps\t{%1, %0|%0, %1}"; + return "%vmovaps\t{%1, %0|%0, %1}"; else - return "movdqa\t{%1, %0|%0, %1}"; + return "%vmovdqa\t{%1, %0|%0, %1}"; case 2: if (get_attr_mode (insn) == MODE_V4SF) - return "xorps\t%0, %0"; + return "%vxorps\t%0, %d0"; else - return "pxor\t%0, %0"; + return "%vpxor\t%0, %d0"; case 3: case 4: return "#"; @@ -3294,6 +3464,7 @@ } } [(set_attr "type" "ssemov,ssemov,sselog1,*,*") + (set_attr "prefix" "maybe_vex,maybe_vex,maybe_vex,*,*") (set (attr "mode") (cond [(eq_attr "alternative" "0,2") (if_then_else @@ -3669,10 +3840,11 @@ # movd\t{%1, %0|%0, %1} movd\t{%1, %0|%0, %1} - movd\t{%1, %0|%0, %1} - movd\t{%1, %0|%0, %1}" - [(set_attr "mode" "SI,SI,SI,DI,DI,TI,TI") - (set_attr "type" "multi,multi,multi,mmxmov,mmxmov,ssemov,ssemov")]) + %vmovd\t{%1, %0|%0, %1} + %vmovd\t{%1, %0|%0, %1}" + [(set_attr "type" "multi,multi,multi,mmxmov,mmxmov,ssemov,ssemov") + (set_attr "prefix" "*,*,*,orig,orig,maybe_vex,maybe_vex") + (set_attr "mode" "SI,SI,SI,DI,DI,TI,TI")]) (define_insn "zero_extendsidi2_rex64" [(set (match_operand:DI 0 "nonimmediate_operand" "=r,o,?*Ym,?*y,?*Yi,*Y2") @@ -3684,9 +3856,10 @@ # movd\t{%1, %0|%0, %1} movd\t{%1, %0|%0, %1} - movd\t{%1, %0|%0, %1} - movd\t{%1, %0|%0, %1}" + %vmovd\t{%1, %0|%0, %1} + %vmovd\t{%1, %0|%0, %1}" [(set_attr "type" "imovx,imov,mmxmov,mmxmov,ssemov,ssemov") + (set_attr "prefix" "orig,*,orig,orig,maybe_vex,maybe_vex") (set_attr "mode" "SI,DI,DI,DI,TI,TI")]) (define_split @@ -4071,21 +4244,23 @@ return output_387_reg_move (insn, operands); case 2: - return "cvtss2sd\t{%1, %0|%0, %1}"; + return "%vcvtss2sd\t{%1, %d0|%d0, %1}"; default: gcc_unreachable (); } } [(set_attr "type" "fmov,fmov,ssecvt") + (set_attr "prefix" "orig,orig,maybe_vex") 
(set_attr "mode" "SF,XF,DF")]) (define_insn "*extendsfdf2_sse" [(set (match_operand:DF 0 "nonimmediate_operand" "=x") (float_extend:DF (match_operand:SF 1 "nonimmediate_operand" "xm")))] "TARGET_SSE2 && TARGET_SSE_MATH" - "cvtss2sd\t{%1, %0|%0, %1}" + "%vcvtss2sd\t{%1, %d0|%d0, %1}" [(set_attr "type" "ssecvt") + (set_attr "prefix" "maybe_vex") (set_attr "mode" "DF")]) (define_insn "*extendsfdf2_i387" @@ -4214,12 +4389,13 @@ case 0: return output_387_reg_move (insn, operands); case 1: - return "cvtsd2ss\t{%1, %0|%0, %1}"; + return "%vcvtsd2ss\t{%1, %d0|%d0, %1}"; default: gcc_unreachable (); } } [(set_attr "type" "fmov,ssecvt") + (set_attr "prefix" "orig,maybe_vex") (set_attr "mode" "SF")]) ;; Yes, this one doesn't depend on flag_unsafe_math_optimizations, @@ -4229,8 +4405,9 @@ (float_truncate:SF (match_operand:DF 1 "nonimmediate_operand" "xm")))] "TARGET_SSE2 && TARGET_SSE_MATH" - "cvtsd2ss\t{%1, %0|%0, %1}" + "%vcvtsd2ss\t{%1, %d0|%d0, %1}" [(set_attr "type" "ssecvt") + (set_attr "prefix" "maybe_vex") (set_attr "mode" "SF")]) (define_insn "*truncdfsf_fast_i387" @@ -4257,13 +4434,14 @@ case 1: return "#"; case 2: - return "cvtsd2ss\t{%1, %0|%0, %1}"; + return "%vcvtsd2ss\t{%1, %d0|%d0, %1}"; default: gcc_unreachable (); } } [(set_attr "type" "fmov,multi,ssecvt") (set_attr "unit" "*,i387,*") + (set_attr "prefix" "orig,orig,maybe_vex") (set_attr "mode" "SF")]) (define_insn "*truncdfsf_i387" @@ -4550,8 +4728,9 @@ (fix:DI (match_operand:MODEF 1 "nonimmediate_operand" "x,m")))] "TARGET_64BIT && SSE_FLOAT_MODE_P (mode) && (!TARGET_FISTTP || TARGET_SSE_MATH)" - "cvtts2si{q}\t{%1, %0|%0, %1}" + "%vcvtts2si{q}\t{%1, %0|%0, %1}" [(set_attr "type" "sseicvt") + (set_attr "prefix" "maybe_vex") (set_attr "mode" "") (set_attr "athlon_decode" "double,vector") (set_attr "amdfam10_decode" "double,double")]) @@ -4561,8 +4740,9 @@ (fix:SI (match_operand:MODEF 1 "nonimmediate_operand" "x,m")))] "SSE_FLOAT_MODE_P (mode) && (!TARGET_FISTTP || TARGET_SSE_MATH)" - "cvtts2si\t{%1, %0|%0, %1}" + "%vcvtts2si\t{%1, %0|%0, %1}" [(set_attr "type" "sseicvt") + (set_attr "prefix" "maybe_vex") (set_attr "mode" "") (set_attr "athlon_decode" "double,vector") (set_attr "amdfam10_decode" "double,double")]) @@ -5042,9 +5222,10 @@ && (TARGET_INTER_UNIT_CONVERSIONS || optimize_size)" "@ fild%z1\t%1 - cvtsi2s\t{%1, %0|%0, %1} - cvtsi2s\t{%1, %0|%0, %1}" + %vcvtsi2s\t{%1, %d0|%d0, %1} + %vcvtsi2s\t{%1, %d0|%d0, %1}" [(set_attr "type" "fmov,sseicvt,sseicvt") + (set_attr "prefix" "orig,maybe_vex,maybe_vex") (set_attr "mode" "") (set_attr "unit" "i387,*,*") (set_attr "athlon_decode" "*,double,direct") @@ -5060,8 +5241,9 @@ && !(TARGET_INTER_UNIT_CONVERSIONS || optimize_size)" "@ fild%z1\t%1 - cvtsi2s\t{%1, %0|%0, %1}" + %vcvtsi2s\t{%1, %d0|%d0, %1}" [(set_attr "type" "fmov,sseicvt") + (set_attr "prefix" "orig,maybe_vex") (set_attr "mode" "") (set_attr "athlon_decode" "*,direct") (set_attr "amdfam10_decode" "*,double") @@ -5232,8 +5414,9 @@ "(mode != DImode || TARGET_64BIT) && SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH && (TARGET_INTER_UNIT_CONVERSIONS || optimize_size)" - "cvtsi2s\t{%1, %0|%0, %1}" + "%vcvtsi2s\t{%1, %d0|%d0, %1}" [(set_attr "type" "sseicvt") + (set_attr "prefix" "maybe_vex") (set_attr "mode" "") (set_attr "athlon_decode" "double,direct") (set_attr "amdfam10_decode" "vector,double") @@ -5260,8 +5443,9 @@ "(mode != DImode || TARGET_64BIT) && SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH && !(TARGET_INTER_UNIT_CONVERSIONS || optimize_size)" - "cvtsi2s\t{%1, %0|%0, %1}" + "%vcvtsi2s\t{%1, %d0|%d0, %1}" [(set_attr "type" 
"sseicvt") + (set_attr "prefix" "maybe_vex") (set_attr "mode" "") (set_attr "athlon_decode" "direct") (set_attr "amdfam10_decode" "double") @@ -10862,6 +11046,19 @@ ;; This pattern must be defined before *ashlti3_1 to prevent ;; combine pass from converting sse2_ashlti3 to *ashlti3_1. +(define_insn "*avx_ashlti3" + [(set (match_operand:TI 0 "register_operand" "=x") + (ashift:TI (match_operand:TI 1 "register_operand" "x") + (match_operand:SI 2 "const_0_to_255_mul_8_operand" "n")))] + "TARGET_AVX" +{ + operands[2] = GEN_INT (INTVAL (operands[2]) / 8); + return "vpslldq\t{%2, %1, %0|%0, %1, %2}"; +} + [(set_attr "type" "sseishft") + (set_attr "prefix" "vex") + (set_attr "mode" "TI")]) + (define_insn "sse2_ashlti3" [(set (match_operand:TI 0 "register_operand" "=x") (ashift:TI (match_operand:TI 1 "register_operand" "0") @@ -12561,6 +12758,19 @@ ;; This pattern must be defined before *lshrti3_1 to prevent ;; combine pass from converting sse2_lshrti3 to *lshrti3_1. +(define_insn "*avx_lshrti3" + [(set (match_operand:TI 0 "register_operand" "=x") + (lshiftrt:TI (match_operand:TI 1 "register_operand" "x") + (match_operand:SI 2 "const_0_to_255_mul_8_operand" "n")))] + "TARGET_AVX" +{ + operands[2] = GEN_INT (INTVAL (operands[2]) / 8); + return "vpsrldq\t{%2, %1, %0|%0, %1, %2}"; +} + [(set_attr "type" "sseishft") + (set_attr "prefix" "vex") + (set_attr "mode" "TI")]) + (define_insn "sse2_lshrti3" [(set (match_operand:TI 0 "register_operand" "=x") (lshiftrt:TI (match_operand:TI 1 "register_operand" "0") @@ -13935,6 +14145,17 @@ ;; 0xffffffff is NaN, but not in normalized form, so we can't represent ;; it directly. +(define_insn "*avx_setcc" + [(set (match_operand:MODEF 0 "register_operand" "=x") + (match_operator:MODEF 1 "avx_comparison_float_operator" + [(match_operand:MODEF 2 "register_operand" "x") + (match_operand:MODEF 3 "nonimmediate_operand" "xm")]))] + "TARGET_AVX" + "vcmp%D1s\t{%3, %2, %0|%0, %2, %3}" + [(set_attr "type" "ssecmp") + (set_attr "prefix" "vex") + (set_attr "mode" "")]) + (define_insn "*sse_setcc" [(set (match_operand:MODEF 0 "register_operand" "=x") (match_operator:MODEF 1 "sse_comparison_operator" @@ -16013,6 +16234,26 @@ ;; Gcc is slightly more smart about handling normal two address instructions ;; so use special patterns for add and mull. 
+(define_insn "*fop__comm_mixed_avx" + [(set (match_operand:MODEF 0 "register_operand" "=f,x") + (match_operator:MODEF 3 "binary_fp_operator" + [(match_operand:MODEF 1 "nonimmediate_operand" "%0,x") + (match_operand:MODEF 2 "nonimmediate_operand" "fm,xm")]))] + "AVX_FLOAT_MODE_P (mode) && TARGET_MIX_SSE_I387 + && COMMUTATIVE_ARITH_P (operands[3]) + && !(MEM_P (operands[1]) && MEM_P (operands[2]))" + "* return output_387_binary_op (insn, operands);" + [(set (attr "type") + (if_then_else (eq_attr "alternative" "1") + (if_then_else (match_operand:MODEF 3 "mult_operator" "") + (const_string "ssemul") + (const_string "sseadd")) + (if_then_else (match_operand:MODEF 3 "mult_operator" "") + (const_string "fmul") + (const_string "fop")))) + (set_attr "prefix" "orig,maybe_vex") + (set_attr "mode" "")]) + (define_insn "*fop__comm_mixed" [(set (match_operand:MODEF 0 "register_operand" "=f,x") (match_operator:MODEF 3 "binary_fp_operator" @@ -16032,6 +16273,22 @@ (const_string "fop")))) (set_attr "mode" "")]) +(define_insn "*fop__comm_avx" + [(set (match_operand:MODEF 0 "register_operand" "=x") + (match_operator:MODEF 3 "binary_fp_operator" + [(match_operand:MODEF 1 "nonimmediate_operand" "%x") + (match_operand:MODEF 2 "nonimmediate_operand" "xm")]))] + "AVX_FLOAT_MODE_P (mode) && TARGET_SSE_MATH + && COMMUTATIVE_ARITH_P (operands[3]) + && !(MEM_P (operands[1]) && MEM_P (operands[2]))" + "* return output_387_binary_op (insn, operands);" + [(set (attr "type") + (if_then_else (match_operand:MODEF 3 "mult_operator" "") + (const_string "ssemul") + (const_string "sseadd"))) + (set_attr "prefix" "vex") + (set_attr "mode" "")]) + (define_insn "*fop__comm_sse" [(set (match_operand:MODEF 0 "register_operand" "=x") (match_operator:MODEF 3 "binary_fp_operator" @@ -16062,6 +16319,33 @@ (const_string "fop"))) (set_attr "mode" "")]) +(define_insn "*fop__1_mixed_avx" + [(set (match_operand:MODEF 0 "register_operand" "=f,f,x") + (match_operator:MODEF 3 "binary_fp_operator" + [(match_operand:MODEF 1 "nonimmediate_operand" "0,fm,x") + (match_operand:MODEF 2 "nonimmediate_operand" "fm,0,xm")]))] + "AVX_FLOAT_MODE_P (mode) && TARGET_MIX_SSE_I387 + && !COMMUTATIVE_ARITH_P (operands[3]) + && !(MEM_P (operands[1]) && MEM_P (operands[2]))" + "* return output_387_binary_op (insn, operands);" + [(set (attr "type") + (cond [(and (eq_attr "alternative" "2") + (match_operand:MODEF 3 "mult_operator" "")) + (const_string "ssemul") + (and (eq_attr "alternative" "2") + (match_operand:MODEF 3 "div_operator" "")) + (const_string "ssediv") + (eq_attr "alternative" "2") + (const_string "sseadd") + (match_operand:MODEF 3 "mult_operator" "") + (const_string "fmul") + (match_operand:MODEF 3 "div_operator" "") + (const_string "fdiv") + ] + (const_string "fop"))) + (set_attr "prefix" "orig,orig,maybe_vex") + (set_attr "mode" "")]) + (define_insn "*fop__1_mixed" [(set (match_operand:MODEF 0 "register_operand" "=f,f,x") (match_operator:MODEF 3 "binary_fp_operator" @@ -16093,10 +16377,29 @@ (unspec:SF [(match_operand:SF 1 "nonimmediate_operand" "xm")] UNSPEC_RCP))] "TARGET_SSE_MATH" - "rcpss\t{%1, %0|%0, %1}" + "%vrcpss\t{%1, %d0|%d0, %1}" [(set_attr "type" "sse") + (set_attr "prefix" "maybe_vex") (set_attr "mode" "SF")]) +(define_insn "*fop__1_avx" + [(set (match_operand:MODEF 0 "register_operand" "=x") + (match_operator:MODEF 3 "binary_fp_operator" + [(match_operand:MODEF 1 "register_operand" "x") + (match_operand:MODEF 2 "nonimmediate_operand" "xm")]))] + "AVX_FLOAT_MODE_P (mode) && TARGET_SSE_MATH + && !COMMUTATIVE_ARITH_P (operands[3])" + "* 
return output_387_binary_op (insn, operands);" + [(set (attr "type") + (cond [(match_operand:MODEF 3 "mult_operator" "") + (const_string "ssemul") + (match_operand:MODEF 3 "div_operator" "") + (const_string "ssediv") + ] + (const_string "sseadd"))) + (set_attr "prefix" "vex") + (set_attr "mode" "")]) + (define_insn "*fop__1_sse" [(set (match_operand:MODEF 0 "register_operand" "=x") (match_operator:MODEF 3 "binary_fp_operator" @@ -16425,8 +16728,9 @@ (unspec:SF [(match_operand:SF 1 "nonimmediate_operand" "xm")] UNSPEC_RSQRT))] "TARGET_SSE_MATH" - "rsqrtss\t{%1, %0|%0, %1}" + "%vrsqrtss\t{%1, %d0|%d0, %1}" [(set_attr "type" "sse") + (set_attr "prefix" "maybe_vex") (set_attr "mode" "SF")]) (define_expand "rsqrtsf2" @@ -16444,8 +16748,9 @@ (sqrt:MODEF (match_operand:MODEF 1 "nonimmediate_operand" "xm")))] "SSE_FLOAT_MODE_P (mode) && TARGET_SSE_MATH" - "sqrts\t{%1, %0|%0, %1}" + "%vsqrts\t{%1, %d0|%d0, %1}" [(set_attr "type" "sse") + (set_attr "prefix" "maybe_vex") (set_attr "mode" "") (set_attr "athlon_decode" "*") (set_attr "amdfam10_decode" "*")]) @@ -17588,9 +17893,10 @@ (match_operand:SI 2 "const_0_to_15_operand" "n")] UNSPEC_ROUND))] "TARGET_ROUND" - "rounds\t{%2, %1, %0|%0, %1, %2}" + "%vrounds\t{%2, %1, %d0|%d0, %1, %2}" [(set_attr "type" "ssecvt") (set_attr "prefix_extra" "1") + (set_attr "prefix" "maybe_vex") (set_attr "mode" "")]) (define_insn "rintxf2" @@ -19691,6 +19997,17 @@ ;; Since both the tree-level MAX_EXPR and the rtl-level SMAX operator ;; are undefined in this condition, we're certain this is correct. +(define_insn "*avx_3" + [(set (match_operand:MODEF 0 "register_operand" "=x") + (smaxmin:MODEF + (match_operand:MODEF 1 "nonimmediate_operand" "%x") + (match_operand:MODEF 2 "nonimmediate_operand" "xm")))] + "AVX_FLOAT_MODE_P (mode) && TARGET_SSE_MATH" + "vs\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sseadd") + (set_attr "prefix" "vex") + (set_attr "mode" "")]) + (define_insn "3" [(set (match_operand:MODEF 0 "register_operand" "=x") (smaxmin:MODEF @@ -19707,6 +20024,18 @@ ;; Their operands are not commutative, and thus they may be used in the ;; presence of -0.0 and NaN. +(define_insn "*avx_ieee_smin3" + [(set (match_operand:MODEF 0 "register_operand" "=x") + (unspec:MODEF + [(match_operand:MODEF 1 "register_operand" "x") + (match_operand:MODEF 2 "nonimmediate_operand" "xm")] + UNSPEC_IEEE_MIN))] + "AVX_FLOAT_MODE_P (mode) && TARGET_SSE_MATH" + "vmins\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sseadd") + (set_attr "prefix" "vex") + (set_attr "mode" "")]) + (define_insn "*ieee_smin3" [(set (match_operand:MODEF 0 "register_operand" "=x") (unspec:MODEF @@ -19718,6 +20047,18 @@ [(set_attr "type" "sseadd") (set_attr "mode" "")]) +(define_insn "*avx_ieee_smax3" + [(set (match_operand:MODEF 0 "register_operand" "=x") + (unspec:MODEF + [(match_operand:MODEF 1 "register_operand" "0") + (match_operand:MODEF 2 "nonimmediate_operand" "xm")] + UNSPEC_IEEE_MAX))] + "AVX_FLOAT_MODE_P (mode) && TARGET_SSE_MATH" + "vmaxs\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sseadd") + (set_attr "prefix" "vex") + (set_attr "mode" "")]) + (define_insn "*ieee_smax3" [(set (match_operand:MODEF 0 "register_operand" "=x") (unspec:MODEF @@ -21185,6 +21526,10 @@ int i; operands[0] = gen_rtx_MEM (Pmode, gen_rtx_PLUS (Pmode, operands[0], operands[4])); + /* VEX instruction with a REX prefix will #UD. 
*/ + if (TARGET_AVX && GET_CODE (XEXP (operands[0], 0)) != PLUS) + gcc_unreachable (); + output_asm_insn ("jmp\t%A1", operands); for (i = X86_64_SSE_REGPARM_MAX - 1; i >= INTVAL (operands[2]); i--) { @@ -21193,7 +21538,7 @@ PUT_MODE (operands[4], TImode); if (GET_CODE (XEXP (operands[0], 0)) != PLUS) output_asm_insn ("rex", operands); - output_asm_insn ("movaps\t{%5, %4|%4, %5}", operands); + output_asm_insn ("%vmovaps\t{%5, %4|%4, %5}", operands); } (*targetm.asm_out.internal_label) (asm_out_file, "L", CODE_LABEL_NUMBER (operands[3])); @@ -21202,9 +21547,14 @@ [(set_attr "type" "other") (set_attr "length_immediate" "0") (set_attr "length_address" "0") - (set_attr "length" "34") + (set (attr "length") + (if_then_else + (eq (symbol_ref "TARGET_AVX") (const_int 0)) + (const_string "34") + (const_string "42"))) (set_attr "memory" "store") (set_attr "modrm" "0") + (set_attr "prefix" "maybe_vex") (set_attr "mode" "DI")]) (define_expand "prefetch" diff --git a/gcc/config/i386/i386.opt b/gcc/config/i386/i386.opt index fc59b77..f362dde 100644 --- a/gcc/config/i386/i386.opt +++ b/gcc/config/i386/i386.opt @@ -299,6 +299,14 @@ mno-sse4 Target RejectNegative Report InverseMask(ISA_SSE4_1) MaskExists Var(ix86_isa_flags) VarExists Save Do not support SSE4.1 and SSE4.2 built-in functions and code generation +mavx +Target Report Mask(ISA_AVX) Var(ix86_isa_flags) VarExists +Support MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2 and AVX built-in functions and code generation + +mfma +Target Report Mask(ISA_FMA) Var(ix86_isa_flags) VarExists +Support MMX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2, AVX and FMA built-in functions and code generation + msse4a Target Report Mask(ISA_SSE4A) Var(ix86_isa_flags) VarExists Save Support MMX, SSE, SSE2, SSE3 and SSE4A built-in functions and code generation diff --git a/gcc/config/i386/mmx.md b/gcc/config/i386/mmx.md index 9bc6758..16aaf2c 100644 --- a/gcc/config/i386/mmx.md +++ b/gcc/config/i386/mmx.md @@ -78,15 +78,45 @@ movq\t{%1, %0|%0, %1} movdq2q\t{%1, %0|%0, %1} movq2dq\t{%1, %0|%0, %1} - pxor\t%0, %0 - movq\t{%1, %0|%0, %1} - movq\t{%1, %0|%0, %1} - movd\t{%1, %0|%0, %1} - movd\t{%1, %0|%0, %1}" + %vpxor\t%0, %d0 + %vmovq\t{%1, %0|%0, %1} + %vmovq\t{%1, %0|%0, %1} + %vmovq\t{%1, %0|%0, %1} + %vmovq\t{%1, %0|%0, %1}" [(set_attr "type" "imov,imov,mmx,mmxmov,mmxmov,ssecvt,ssecvt,sselog1,ssemov,ssemov,ssemov,ssemov") (set_attr "unit" "*,*,*,*,*,mmx,mmx,*,*,*,*,*") + (set (attr "prefix") + (if_then_else (eq_attr "alternative" "7,8,9,10,11") + (const_string "maybe_vex") + (const_string "orig"))) (set_attr "mode" "DI")]) +(define_insn "*mov_internal_avx" + [(set (match_operand:MMXMODEI8 0 "nonimmediate_operand" + "=!?y,!?y,m ,!y ,*Y2,*Y2,*Y2 ,m ,r ,m") + (match_operand:MMXMODEI8 1 "vector_move_operand" + "C ,!ym,!?y,*Y2,!y ,C ,*Y2m,*Y2,irm,r"))] + "TARGET_AVX + && !(MEM_P (operands[0]) && MEM_P (operands[1]))" + "@ + pxor\t%0, %0 + movq\t{%1, %0|%0, %1} + movq\t{%1, %0|%0, %1} + movdq2q\t{%1, %0|%0, %1} + movq2dq\t{%1, %0|%0, %1} + vpxor\t%0, %0, %0 + vmovq\t{%1, %0|%0, %1} + vmovq\t{%1, %0|%0, %1} + # + #" + [(set_attr "type" "mmx,mmxmov,mmxmov,ssecvt,ssecvt,sselog1,ssemov,ssemov,*,*") + (set_attr "unit" "*,*,*,mmx,mmx,*,*,*,*,*") + (set (attr "prefix") + (if_then_else (eq_attr "alternative" "5,6,7") + (const_string "vex") + (const_string "orig"))) + (set_attr "mode" "DI,DI,DI,DI,DI,TI,DI,DI,DI,DI")]) + (define_insn "*mov_internal" [(set (match_operand:MMXMODEI8 0 "nonimmediate_operand" "=!?y,!?y,m ,!y ,*Y2,*Y2,*Y2 ,m ,*x,*x,*x,m ,r ,m") @@ -122,6 +152,35 @@ DONE; }) 
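The i386.opt hunk above introduces the -mavx and -mfma command-line flags that gate all of the patterns in this patch. A hypothetical compile-only example (not taken from the patch or its test suite) that exercises the new 256-bit V8SFmode move and add patterns via GCC's generic vector extension; the typedef, function name and file name are mine:

/* avx-demo.c — build with something like:  gcc -O2 -mavx -S avx-demo.c  */
typedef float v8sf __attribute__ ((vector_size (32)));

v8sf
add8 (v8sf a, v8sf b)
{
  /* With -mavx this addition is expected to stay in one 256-bit
     register and become a single vaddps; without it the vectors are
     split into 128-bit halves.  */
  return a + b;
}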
+(define_insn "*movv2sf_internal_rex64_avx" + [(set (match_operand:V2SF 0 "nonimmediate_operand" + "=rm,r ,!?y,!?y ,m ,!y,Y2,x,x,x,m,r,x") + (match_operand:V2SF 1 "vector_move_operand" + "Cr ,m ,C ,!?ym,!y,Y2,!y,C,x,m,x,x,r"))] + "TARGET_64BIT && TARGET_AVX + && !(MEM_P (operands[0]) && MEM_P (operands[1]))" + "@ + mov{q}\t{%1, %0|%0, %1} + mov{q}\t{%1, %0|%0, %1} + pxor\t%0, %0 + movq\t{%1, %0|%0, %1} + movq\t{%1, %0|%0, %1} + movdq2q\t{%1, %0|%0, %1} + movq2dq\t{%1, %0|%0, %1} + vxorps\t%0, %0, %0 + vmovaps\t{%1, %0|%0, %1} + vmovlps\t{%1, %0, %0|%0, %0, %1} + vmovlps\t{%1, %0|%0, %1} + vmovq\t{%1, %0|%0, %1} + vmovq\t{%1, %0|%0, %1}" + [(set_attr "type" "imov,imov,mmx,mmxmov,mmxmov,ssecvt,ssecvt,ssemov,sselog1,ssemov,ssemov,ssemov,ssemov") + (set_attr "unit" "*,*,*,*,*,mmx,mmx,*,*,*,*,*,*") + (set (attr "prefix") + (if_then_else (eq_attr "alternative" "7,8,9,10,11,12") + (const_string "vex") + (const_string "orig"))) + (set_attr "mode" "DI,DI,DI,DI,DI,DI,DI,V4SF,V4SF,V2SF,V2SF,DI,DI")]) + (define_insn "*movv2sf_internal_rex64" [(set (match_operand:V2SF 0 "nonimmediate_operand" "=rm,r ,!?y,!?y ,m ,!y,*Y2,x,x,x,m,r,Yi") @@ -147,6 +206,33 @@ (set_attr "unit" "*,*,*,*,*,mmx,mmx,*,*,*,*,*,*") (set_attr "mode" "DI,DI,DI,DI,DI,DI,DI,V4SF,V4SF,V2SF,V2SF,DI,DI")]) +(define_insn "*movv2sf_internal_avx" + [(set (match_operand:V2SF 0 "nonimmediate_operand" + "=!?y,!?y ,m ,!y ,*Y2,*x,*x,*x,m ,r ,m") + (match_operand:V2SF 1 "vector_move_operand" + "C ,!?ym,!?y,*Y2,!y ,C ,*x,m ,*x,irm,r"))] + "TARGET_AVX + && !(MEM_P (operands[0]) && MEM_P (operands[1]))" + "@ + pxor\t%0, %0 + movq\t{%1, %0|%0, %1} + movq\t{%1, %0|%0, %1} + movdq2q\t{%1, %0|%0, %1} + movq2dq\t{%1, %0|%0, %1} + vxorps\t%0, %0, %0 + vmovaps\t{%1, %0|%0, %1} + vmovlps\t{%1, %0, %0|%0, %0, %1} + vmovlps\t{%1, %0|%0, %1} + # + #" + [(set_attr "type" "mmx,mmxmov,mmxmov,ssecvt,ssecvt,sselog1,ssemov,ssemov,ssemov,*,*") + (set_attr "unit" "*,*,*,mmx,mmx,*,*,*,*,*,*") + (set (attr "prefix") + (if_then_else (eq_attr "alternative" "5,6,7,8") + (const_string "vex") + (const_string "orig"))) + (set_attr "mode" "DI,DI,DI,DI,DI,V4SF,V4SF,V2SF,V2SF,DI,DI")]) + (define_insn "*movv2sf_internal" [(set (match_operand:V2SF 0 "nonimmediate_operand" "=!?y,!?y ,m ,!y ,*Y2,*x,*x,*x,m ,r ,m") diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md index f36c6d4..36a1b3a 100644 --- a/gcc/config/i386/predicates.md +++ b/gcc/config/i386/predicates.md @@ -640,11 +640,31 @@ (and (match_code "const_int") (match_test "IN_RANGE (INTVAL (op), 2, 3)"))) +;; Match 4 to 5. +(define_predicate "const_4_to_5_operand" + (and (match_code "const_int") + (match_test "IN_RANGE (INTVAL (op), 4, 5)"))) + ;; Match 4 to 7. (define_predicate "const_4_to_7_operand" (and (match_code "const_int") (match_test "IN_RANGE (INTVAL (op), 4, 7)"))) +;; Match 6 to 7. +(define_predicate "const_6_to_7_operand" + (and (match_code "const_int") + (match_test "IN_RANGE (INTVAL (op), 6, 7)"))) + +;; Match 8 to 11. +(define_predicate "const_8_to_11_operand" + (and (match_code "const_int") + (match_test "IN_RANGE (INTVAL (op), 8, 11)"))) + +;; Match 12 to 15. +(define_predicate "const_12_to_15_operand" + (and (match_code "const_int") + (match_test "IN_RANGE (INTVAL (op), 12, 15)"))) + ;; Match exactly one bit in 2-bit mask. 
(define_predicate "const_pow2_1_to_2_operand" (and (match_code "const_int") @@ -914,6 +934,11 @@ (define_special_predicate "sse_comparison_operator" (match_code "eq,lt,le,unordered,ne,unge,ungt,ordered")) +;; Return 1 if OP is a comparison operator that can be issued by +;; avx predicate generation instructions +(define_predicate "avx_comparison_float_operator" + (match_code "ne,eq,ge,gt,le,lt,unordered,ordered,uneq,unge,ungt,unle,unlt,ltgt")) + ;; Return 1 if OP is a comparison operator that can be issued by sse predicate ;; generation instructions (define_predicate "sse5_comparison_float_operator" @@ -1057,3 +1082,15 @@ (define_predicate "misaligned_operand" (and (match_code "mem") (match_test "MEM_ALIGN (op) < GET_MODE_ALIGNMENT (mode)"))) + +;; Return 1 if OP is a vzeroall operation, known to be a PARALLEL. +(define_predicate "vzeroall_operation" + (match_code "parallel") +{ + int nregs = TARGET_64BIT ? 16 : 8; + + if (XVECLEN (op, 0) != nregs + 1) + return 0; + + return 1; +}) diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 72098ec..208a530 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -26,6 +26,18 @@ ;; All 16-byte vector modes handled by SSE (define_mode_iterator SSEMODE [V16QI V8HI V4SI V2DI V4SF V2DF]) +;; 32 byte integral vector modes handled by AVX +(define_mode_iterator AVX256MODEI [V32QI V16HI V8SI V4DI]) + +;; All 32-byte vector modes handled by AVX +(define_mode_iterator AVX256MODE [V32QI V16HI V8SI V4DI V8SF V4DF]) + +;; All QI vector modes handled by AVX +(define_mode_iterator AVXMODEQI [V32QI V16QI]) + +;; All vector modes handled by AVX +(define_mode_iterator AVXMODE [V16QI V8HI V4SI V2DI V4SF V2DF V32QI V16HI V8SI V4DI V8SF V4DF]) + ;; Mix-n-match (define_mode_iterator SSEMODE12 [V16QI V8HI]) (define_mode_iterator SSEMODE24 [V8HI V4SI]) @@ -36,6 +48,15 @@ (define_mode_iterator SSEMODEF4 [SF DF V4SF V2DF]) (define_mode_iterator SSEMODEF2P [V4SF V2DF]) +(define_mode_iterator AVX256MODEF2P [V8SF V4DF]) +(define_mode_iterator AVX256MODE2P [V8SI V8SF V4DF]) +(define_mode_iterator AVX256MODE4P [V4DI V4DF]) +(define_mode_iterator AVX256MODE8P [V8SI V8SF]) +(define_mode_iterator AVXMODEF2P [V4SF V2DF V8SF V4DF]) +(define_mode_iterator AVXMODEF4P [V4SF V4DF]) +(define_mode_iterator AVXMODEDCVTDQ2PS [V4SF V8SF]) +(define_mode_iterator AVXMODEDCVTPS2DQ [V4SI V8SI]) + ;; Int-float size matches (define_mode_iterator SSEMODE4S [V4SF V4SI]) (define_mode_iterator SSEMODE2D [V2DF V2DI]) @@ -70,8 +91,45 @@ (V16QI "16") (V8HI "8") (V4SI "4") (V2DI "2")]) +;; Mapping for AVX +(define_mode_attr avxvecmode + [(V16QI "TI") (V8HI "TI") (V4SI "TI") (V2DI "TI") (V4SF "V4SF") + (V2DF "V2DF") (V32QI "OI") (V16HI "OI") (V8SI "OI") (V4DI "OI") + (V8SF "V8SF") (V4DF "V4DF")]) +(define_mode_attr avxvecpsmode + [(V16QI "V4SF") (V8HI "V4SF") (V4SI "V4SF") (V2DI "V4SF") + (V32QI "V8SF") (V16HI "V8SF") (V8SI "V8SF") (V4DI "V8SF")]) +(define_mode_attr avxhalfvecmode + [(V4SF "V2SF") (V32QI "V16QI") (V16HI "V8HI") (V8SI "V4SI") + (V4DI "V2DI") (V8SF "V4SF") (V4DF "V2DF")]) +(define_mode_attr avxscalarmode + [(V16QI "QI") (V8HI "HI") (V4SI "SI") (V4SF "SF") (V2DF "DF") + (V8SF "SF") (V4DF "DF")]) +(define_mode_attr avxcvtvecmode + [(V4SF "V4SI") (V8SF "V8SI") (V4SI "V4SF") (V8SI "V8SF")]) +(define_mode_attr avxpermvecmode + [(V2DF "V2DI") (V4SF "V4SI") (V4DF "V4DI") (V8SF "V8SI")]) +(define_mode_attr avxmodesuffixf2c + [(V4SF "s") (V2DF "d") (V8SF "s") (V4DF "d")]) +(define_mode_attr avxmodesuffixp + [(V2DF "pd") (V4SI "si") (V4SF "ps") (V8SF "ps") (V8SI "si") + (V4DF 
"pd")]) +(define_mode_attr avxmodesuffixs + [(V16QI "b") (V8HI "w") (V4SI "d")]) +(define_mode_attr avxmodesuffix + [(V16QI "") (V32QI "256") (V4SI "") (V4SF "") (V2DF "") + (V8SI "256") (V8SF "256") (V4DF "256")]) + ;; Mapping of immediate bits for blend instructions -(define_mode_attr blendbits [(V4SF "15") (V2DF "3")]) +(define_mode_attr blendbits + [(V8SF "255") (V4SF "15") (V4DF "15") (V2DF "3")]) + +;; Mapping of immediate bits for vpermil instructions +(define_mode_attr vpermilbits + [(V8SF "255") (V4SF "255") (V4DF "15") (V2DF "3")]) + +;; Mapping of immediate bits for pinsr instructions +(define_mode_attr pinsrbits [(V16QI "32768") (V8HI "128") (V4SI "8")]) ;; Patterns whose name begins with "sse{,2,3}_" are invoked by intrinsics. @@ -81,6 +139,47 @@ ;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +(define_expand "mov" + [(set (match_operand:AVX256MODE 0 "nonimmediate_operand" "") + (match_operand:AVX256MODE 1 "nonimmediate_operand" ""))] + "TARGET_AVX" +{ + ix86_expand_vector_move (mode, operands); + DONE; +}) + +(define_insn "*avx_mov_internal" + [(set (match_operand:AVXMODE 0 "nonimmediate_operand" "=x,x ,m") + (match_operand:AVXMODE 1 "nonimmediate_or_sse_const_operand" "C ,xm,x"))] + "TARGET_AVX + && (register_operand (operands[0], mode) + || register_operand (operands[1], mode))" +{ + switch (which_alternative) + { + case 0: + return standard_sse_constant_opcode (insn, operands[1]); + case 1: + case 2: + switch (get_attr_mode (insn)) + { + case MODE_V8SF: + case MODE_V4SF: + return "vmovaps\t{%1, %0|%0, %1}"; + case MODE_V4DF: + case MODE_V2DF: + return "vmovapd\t{%1, %0|%0, %1}"; + default: + return "vmovdqa\t{%1, %0|%0, %1}"; + } + default: + gcc_unreachable (); + } +} + [(set_attr "type" "sselog1,ssemov,ssemov") + (set_attr "prefix" "vex") + (set_attr "mode" "")]) + ;; All of these patterns are enabled for SSE1 as well as SSE2. ;; This is essential for maintaining stable calling conventions. 
@@ -194,6 +293,14 @@ }) (define_expand "push1" + [(match_operand:AVX256MODE 0 "register_operand" "")] + "TARGET_AVX" +{ + ix86_expand_push (mode, operands[0]); + DONE; +}) + +(define_expand "push1" [(match_operand:SSEMODE 0 "register_operand" "")] "TARGET_SSE" { @@ -202,6 +309,15 @@ }) (define_expand "movmisalign" + [(set (match_operand:AVX256MODE 0 "nonimmediate_operand" "") + (match_operand:AVX256MODE 1 "nonimmediate_operand" ""))] + "TARGET_AVX" +{ + ix86_expand_vector_move_misalign (mode, operands); + DONE; +}) + +(define_expand "movmisalign" [(set (match_operand:SSEMODE 0 "nonimmediate_operand" "") (match_operand:SSEMODE 1 "nonimmediate_operand" ""))] "TARGET_SSE" @@ -210,6 +326,18 @@ DONE; }) +(define_insn "avx_movup" + [(set (match_operand:AVXMODEF2P 0 "nonimmediate_operand" "=x,m") + (unspec:AVXMODEF2P + [(match_operand:AVXMODEF2P 1 "nonimmediate_operand" "xm,x")] + UNSPEC_MOVU))] + "AVX_VEC_FLOAT_MODE_P (mode) + && !(MEM_P (operands[0]) && MEM_P (operands[1]))" + "vmovup\t{%1, %0|%0, %1}" + [(set_attr "type" "ssemov") + (set_attr "prefix" "vex") + (set_attr "mode" "")]) + (define_insn "sse2_movq128" [(set (match_operand:V2DI 0 "register_operand" "=x") (vec_concat:V2DI @@ -218,8 +346,9 @@ (parallel [(const_int 0)])) (const_int 0)))] "TARGET_SSE2" - "movq\t{%1, %0|%0, %1}" + "%vmovq\t{%1, %0|%0, %1}" [(set_attr "type" "ssemov") + (set_attr "prefix" "maybe_vex") (set_attr "mode" "TI")]) (define_insn "_movup" @@ -233,6 +362,17 @@ [(set_attr "type" "ssemov") (set_attr "mode" "")]) +(define_insn "avx_movdqu" + [(set (match_operand:AVXMODEQI 0 "nonimmediate_operand" "=x,m") + (unspec:AVXMODEQI + [(match_operand:AVXMODEQI 1 "nonimmediate_operand" "xm,x")] + UNSPEC_MOVU))] + "TARGET_AVX && !(MEM_P (operands[0]) && MEM_P (operands[1]))" + "vmovdqu\t{%1, %0|%0, %1}" + [(set_attr "type" "ssemov") + (set_attr "prefix" "vex") + (set_attr "mode" "")]) + (define_insn "sse2_movdqu" [(set (match_operand:V16QI 0 "nonimmediate_operand" "=x,m") (unspec:V16QI [(match_operand:V16QI 1 "nonimmediate_operand" "xm,x")] @@ -249,8 +389,9 @@ [(match_operand:SSEMODEF2P 1 "register_operand" "x")] UNSPEC_MOVNT))] "SSE_VEC_FLOAT_MODE_P (mode)" - "movntp\t{%1, %0|%0, %1}" + "%vmovntp\t{%1, %0|%0, %1}" [(set_attr "type" "ssemov") + (set_attr "prefix" "maybe_vex") (set_attr "mode" "")]) (define_insn "sse2_movntv2di" @@ -258,9 +399,10 @@ (unspec:V2DI [(match_operand:V2DI 1 "register_operand" "x")] UNSPEC_MOVNT))] "TARGET_SSE2" - "movntdq\t{%1, %0|%0, %1}" + "%vmovntdq\t{%1, %0|%0, %1}" [(set_attr "type" "ssecvt") (set_attr "prefix_data16" "1") + (set_attr "prefix" "maybe_vex") (set_attr "mode" "TI")]) (define_insn "sse2_movntsi" @@ -272,6 +414,17 @@ [(set_attr "type" "ssecvt") (set_attr "mode" "V2DF")]) +(define_insn "avx_lddqu" + [(set (match_operand:AVXMODEQI 0 "register_operand" "=x") + (unspec:AVXMODEQI + [(match_operand:AVXMODEQI 1 "memory_operand" "m")] + UNSPEC_LDDQU))] + "TARGET_AVX" + "vlddqu\t{%1, %0|%0, %1}" + [(set_attr "type" "ssecvt") + (set_attr "prefix" "vex") + (set_attr "mode" "")]) + (define_insn "sse3_lddqu" [(set (match_operand:V16QI 0 "register_operand" "=x") (unspec:V16QI [(match_operand:V16QI 1 "memory_operand" "m")] @@ -330,6 +483,26 @@ "ix86_expand_fp_absneg_operator (, mode, operands); DONE;") (define_expand "3" + [(set (match_operand:AVX256MODEF2P 0 "register_operand" "") + (plusminus:AVX256MODEF2P + (match_operand:AVX256MODEF2P 1 "nonimmediate_operand" "") + (match_operand:AVX256MODEF2P 2 "nonimmediate_operand" "")))] + "AVX256_VEC_FLOAT_MODE_P (mode)" + "ix86_fixup_binary_operands_no_copy 
(, mode, operands);") + +(define_insn "*avx_3" + [(set (match_operand:AVXMODEF2P 0 "register_operand" "=x") + (plusminus:AVXMODEF2P + (match_operand:AVXMODEF2P 1 "nonimmediate_operand" "x") + (match_operand:AVXMODEF2P 2 "nonimmediate_operand" "xm")))] + "AVX_VEC_FLOAT_MODE_P (mode) + && ix86_binary_operator_ok (, mode, operands)" + "vp\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sseadd") + (set_attr "prefix" "vex") + (set_attr "mode" "")]) + +(define_expand "3" [(set (match_operand:SSEMODEF2P 0 "register_operand" "") (plusminus:SSEMODEF2P (match_operand:SSEMODEF2P 1 "nonimmediate_operand" "") @@ -348,6 +521,20 @@ [(set_attr "type" "sseadd") (set_attr "mode" "")]) +(define_insn "*avx_vm3" + [(set (match_operand:SSEMODEF2P 0 "register_operand" "=x") + (vec_merge:SSEMODEF2P + (plusminus:SSEMODEF2P + (match_operand:SSEMODEF2P 1 "register_operand" "x") + (match_operand:SSEMODEF2P 2 "nonimmediate_operand" "xm")) + (match_dup 1) + (const_int 1)))] + "AVX128_VEC_FLOAT_MODE_P (mode)" + "vs\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sseadd") + (set_attr "prefix" "vex") + (set_attr "mode" "")]) + (define_insn "_vm3" [(set (match_operand:SSEMODEF2P 0 "register_operand" "=x") (vec_merge:SSEMODEF2P @@ -362,6 +549,26 @@ (set_attr "mode" "")]) (define_expand "mul3" + [(set (match_operand:AVX256MODEF2P 0 "register_operand" "") + (mult:AVX256MODEF2P + (match_operand:AVX256MODEF2P 1 "nonimmediate_operand" "") + (match_operand:AVX256MODEF2P 2 "nonimmediate_operand" "")))] + "AVX256_VEC_FLOAT_MODE_P (mode)" + "ix86_fixup_binary_operands_no_copy (MULT, mode, operands);") + +(define_insn "*avx_mul3" + [(set (match_operand:AVXMODEF2P 0 "register_operand" "=x") + (mult:AVXMODEF2P + (match_operand:AVXMODEF2P 1 "nonimmediate_operand" "%x") + (match_operand:AVXMODEF2P 2 "nonimmediate_operand" "xm")))] + "AVX_VEC_FLOAT_MODE_P (mode) + && ix86_binary_operator_ok (MULT, mode, operands)" + "vmulp\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "ssemul") + (set_attr "prefix" "vex") + (set_attr "mode" "")]) + +(define_expand "mul3" [(set (match_operand:SSEMODEF2P 0 "register_operand" "") (mult:SSEMODEF2P (match_operand:SSEMODEF2P 1 "nonimmediate_operand" "") @@ -380,6 +587,20 @@ [(set_attr "type" "ssemul") (set_attr "mode" "")]) +(define_insn "*avx_vmmul3" + [(set (match_operand:SSEMODEF2P 0 "register_operand" "=x") + (vec_merge:SSEMODEF2P + (mult:SSEMODEF2P + (match_operand:SSEMODEF2P 1 "register_operand" "x") + (match_operand:SSEMODEF2P 2 "nonimmediate_operand" "xm")) + (match_dup 1) + (const_int 1)))] + "AVX_VEC_FLOAT_MODE_P (mode)" + "vmuls\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "ssemul") + (set_attr "prefix" "vex") + (set_attr "mode" "")]) + (define_insn "_vmmul3" [(set (match_operand:SSEMODEF2P 0 "register_operand" "=x") (vec_merge:SSEMODEF2P @@ -393,6 +614,42 @@ [(set_attr "type" "ssemul") (set_attr "mode" "")]) +(define_expand "divv8sf3" + [(set (match_operand:V8SF 0 "register_operand" "") + (div:V8SF (match_operand:V8SF 1 "register_operand" "") + (match_operand:V8SF 2 "nonimmediate_operand" "")))] + "TARGET_AVX" +{ + ix86_fixup_binary_operands_no_copy (DIV, V8SFmode, operands); + + if (TARGET_SSE_MATH && TARGET_RECIP && !optimize_size + && flag_finite_math_only && !flag_trapping_math + && flag_unsafe_math_optimizations) + { + ix86_emit_swdivsf (operands[0], operands[1], + operands[2], V8SFmode); + DONE; + } +}) + +(define_expand "divv4df3" + [(set (match_operand:V4DF 0 "register_operand" "") + (div:V4DF (match_operand:V4DF 1 "register_operand" "") + (match_operand:V4DF 2 "nonimmediate_operand" "")))] + 
"TARGET_AVX" + "ix86_fixup_binary_operands_no_copy (DIV, V4DFmode, operands);") + +(define_insn "avx_div3" + [(set (match_operand:AVXMODEF2P 0 "register_operand" "=x") + (div:AVXMODEF2P + (match_operand:AVXMODEF2P 1 "register_operand" "x") + (match_operand:AVXMODEF2P 2 "nonimmediate_operand" "xm")))] + "AVX_VEC_FLOAT_MODE_P (mode)" + "vdivp\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "ssediv") + (set_attr "prefix" "vex") + (set_attr "mode" "")]) + (define_expand "divv4sf3" [(set (match_operand:V4SF 0 "register_operand" "") (div:V4SF (match_operand:V4SF 1 "register_operand" "") @@ -416,6 +673,17 @@ "TARGET_SSE2" "") +(define_insn "*avx_div3" + [(set (match_operand:SSEMODEF2P 0 "register_operand" "=x") + (div:SSEMODEF2P + (match_operand:SSEMODEF2P 1 "register_operand" "x") + (match_operand:SSEMODEF2P 2 "nonimmediate_operand" "xm")))] + "AVX128_VEC_FLOAT_MODE_P (mode)" + "vdivp\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "ssediv") + (set_attr "prefix" "vex") + (set_attr "mode" "")]) + (define_insn "_div3" [(set (match_operand:SSEMODEF2P 0 "register_operand" "=x") (div:SSEMODEF2P @@ -426,6 +694,20 @@ [(set_attr "type" "ssediv") (set_attr "mode" "")]) +(define_insn "*avx_vmdiv3" + [(set (match_operand:SSEMODEF2P 0 "register_operand" "=x") + (vec_merge:SSEMODEF2P + (div:SSEMODEF2P + (match_operand:SSEMODEF2P 1 "register_operand" "x") + (match_operand:SSEMODEF2P 2 "nonimmediate_operand" "xm")) + (match_dup 1) + (const_int 1)))] + "AVX128_VEC_FLOAT_MODE_P (mode)" + "vdivs\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "ssediv") + (set_attr "prefix" "vex") + (set_attr "mode" "")]) + (define_insn "_vmdiv3" [(set (match_operand:SSEMODEF2P 0 "register_operand" "=x") (vec_merge:SSEMODEF2P @@ -439,15 +721,39 @@ [(set_attr "type" "ssediv") (set_attr "mode" "")]) +(define_insn "avx_rcpv8sf2" + [(set (match_operand:V8SF 0 "register_operand" "=x") + (unspec:V8SF + [(match_operand:V8SF 1 "nonimmediate_operand" "xm")] UNSPEC_RCP))] + "TARGET_AVX" + "vrcpps\t{%1, %0|%0, %1}" + [(set_attr "type" "sse") + (set_attr "prefix" "vex") + (set_attr "mode" "V8SF")]) + (define_insn "sse_rcpv4sf2" [(set (match_operand:V4SF 0 "register_operand" "=x") (unspec:V4SF [(match_operand:V4SF 1 "nonimmediate_operand" "xm")] UNSPEC_RCP))] "TARGET_SSE" - "rcpps\t{%1, %0|%0, %1}" + "%vrcpps\t{%1, %0|%0, %1}" [(set_attr "type" "sse") + (set_attr "prefix" "maybe_vex") (set_attr "mode" "V4SF")]) +(define_insn "*avx_vmrcpv4sf2" + [(set (match_operand:V4SF 0 "register_operand" "=x") + (vec_merge:V4SF + (unspec:V4SF [(match_operand:V4SF 1 "nonimmediate_operand" "xm")] + UNSPEC_RCP) + (match_operand:V4SF 2 "register_operand" "x") + (const_int 1)))] + "TARGET_AVX" + "vrcpss\t{%1, %2, %0|%0, %2, %1}" + [(set_attr "type" "sse") + (set_attr "prefix" "vex") + (set_attr "mode" "SF")]) + (define_insn "sse_vmrcpv4sf2" [(set (match_operand:V4SF 0 "register_operand" "=x") (vec_merge:V4SF @@ -460,6 +766,29 @@ [(set_attr "type" "sse") (set_attr "mode" "SF")]) +(define_expand "sqrtv8sf2" + [(set (match_operand:V8SF 0 "register_operand" "") + (sqrt:V8SF (match_operand:V8SF 1 "nonimmediate_operand" "")))] + "TARGET_AVX" +{ + if (TARGET_SSE_MATH && TARGET_RECIP && !optimize_size + && flag_finite_math_only && !flag_trapping_math + && flag_unsafe_math_optimizations) + { + ix86_emit_swsqrtsf (operands[0], operands[1], V8SFmode, 0); + DONE; + } +}) + +(define_insn "avx_sqrtv8sf2" + [(set (match_operand:V8SF 0 "register_operand" "=x") + (sqrt:V8SF (match_operand:V8SF 1 "nonimmediate_operand" "xm")))] + "TARGET_AVX" + "vsqrtps\t{%1, %0|%0, %1}" + [(set_attr 
"type" "sse") + (set_attr "prefix" "vex") + (set_attr "mode" "V8SF")]) + (define_expand "sqrtv4sf2" [(set (match_operand:V4SF 0 "register_operand" "") (sqrt:V4SF (match_operand:V4SF 1 "nonimmediate_operand" "")))] @@ -478,18 +807,42 @@ [(set (match_operand:V4SF 0 "register_operand" "=x") (sqrt:V4SF (match_operand:V4SF 1 "nonimmediate_operand" "xm")))] "TARGET_SSE" - "sqrtps\t{%1, %0|%0, %1}" + "%vsqrtps\t{%1, %0|%0, %1}" [(set_attr "type" "sse") + (set_attr "prefix" "maybe_vex") (set_attr "mode" "V4SF")]) +(define_insn "sqrtv4df2" + [(set (match_operand:V4DF 0 "register_operand" "=x") + (sqrt:V4DF (match_operand:V4DF 1 "nonimmediate_operand" "xm")))] + "TARGET_AVX" + "vsqrtpd\t{%1, %0|%0, %1}" + [(set_attr "type" "sse") + (set_attr "prefix" "vex") + (set_attr "mode" "V4DF")]) + (define_insn "sqrtv2df2" [(set (match_operand:V2DF 0 "register_operand" "=x") (sqrt:V2DF (match_operand:V2DF 1 "nonimmediate_operand" "xm")))] "TARGET_SSE2" - "sqrtpd\t{%1, %0|%0, %1}" + "%vsqrtpd\t{%1, %0|%0, %1}" [(set_attr "type" "sse") + (set_attr "prefix" "maybe_vex") (set_attr "mode" "V2DF")]) +(define_insn "*avx_vmsqrt2" + [(set (match_operand:SSEMODEF2P 0 "register_operand" "=x") + (vec_merge:SSEMODEF2P + (sqrt:SSEMODEF2P + (match_operand:SSEMODEF2P 1 "nonimmediate_operand" "xm")) + (match_operand:SSEMODEF2P 2 "register_operand" "x") + (const_int 1)))] + "AVX_VEC_FLOAT_MODE_P (mode)" + "vsqrts\t{%1, %2, %0|%0, %2, %1}" + [(set_attr "type" "sse") + (set_attr "prefix" "vex") + (set_attr "mode" "")]) + (define_insn "_vmsqrt2" [(set (match_operand:SSEMODEF2P 0 "register_operand" "=x") (vec_merge:SSEMODEF2P @@ -502,6 +855,26 @@ [(set_attr "type" "sse") (set_attr "mode" "")]) +(define_expand "rsqrtv8sf2" + [(set (match_operand:V8SF 0 "register_operand" "") + (unspec:V8SF + [(match_operand:V8SF 1 "nonimmediate_operand" "")] UNSPEC_RSQRT))] + "TARGET_AVX && TARGET_SSE_MATH" +{ + ix86_emit_swsqrtsf (operands[0], operands[1], V8SFmode, 1); + DONE; +}) + +(define_insn "avx_rsqrtv8sf2" + [(set (match_operand:V8SF 0 "register_operand" "=x") + (unspec:V8SF + [(match_operand:V8SF 1 "nonimmediate_operand" "xm")] UNSPEC_RSQRT))] + "TARGET_AVX" + "vrsqrtps\t{%1, %0|%0, %1}" + [(set_attr "type" "sse") + (set_attr "prefix" "vex") + (set_attr "mode" "V8SF")]) + (define_expand "rsqrtv4sf2" [(set (match_operand:V4SF 0 "register_operand" "") (unspec:V4SF @@ -517,10 +890,24 @@ (unspec:V4SF [(match_operand:V4SF 1 "nonimmediate_operand" "xm")] UNSPEC_RSQRT))] "TARGET_SSE" - "rsqrtps\t{%1, %0|%0, %1}" + "%vrsqrtps\t{%1, %0|%0, %1}" [(set_attr "type" "sse") + (set_attr "prefix" "maybe_vex") (set_attr "mode" "V4SF")]) +(define_insn "*avx_vmrsqrtv4sf2" + [(set (match_operand:V4SF 0 "register_operand" "=x") + (vec_merge:V4SF + (unspec:V4SF [(match_operand:V4SF 1 "nonimmediate_operand" "xm")] + UNSPEC_RSQRT) + (match_operand:V4SF 2 "register_operand" "x") + (const_int 1)))] + "TARGET_AVX" + "vrsqrtss\t{%1, %2, %0|%0, %2, %1}" + [(set_attr "type" "sse") + (set_attr "prefix" "vex") + (set_attr "mode" "SF")]) + (define_insn "sse_vmrsqrtv4sf2" [(set (match_operand:V4SF 0 "register_operand" "=x") (vec_merge:V4SF @@ -538,6 +925,18 @@ ;; applied to NaNs. Hopefully the optimizers won't get too smart on us. 
(define_expand "3" + [(set (match_operand:AVX256MODEF2P 0 "register_operand" "") + (smaxmin:AVX256MODEF2P + (match_operand:AVX256MODEF2P 1 "nonimmediate_operand" "") + (match_operand:AVX256MODEF2P 2 "nonimmediate_operand" "")))] + "AVX256_VEC_FLOAT_MODE_P (mode)" +{ + if (!flag_finite_math_only) + operands[1] = force_reg (mode, operands[1]); + ix86_fixup_binary_operands_no_copy (, mode, operands); +}) + +(define_expand "3" [(set (match_operand:SSEMODEF2P 0 "register_operand" "") (smaxmin:SSEMODEF2P (match_operand:SSEMODEF2P 1 "nonimmediate_operand" "") @@ -549,6 +948,18 @@ ix86_fixup_binary_operands_no_copy (, mode, operands); }) +(define_insn "*avx_3_finite" + [(set (match_operand:AVXMODEF2P 0 "register_operand" "=x") + (smaxmin:AVXMODEF2P + (match_operand:AVXMODEF2P 1 "nonimmediate_operand" "%x") + (match_operand:AVXMODEF2P 2 "nonimmediate_operand" "xm")))] + "AVX_VEC_FLOAT_MODE_P (mode) && flag_finite_math_only + && ix86_binary_operator_ok (, mode, operands)" + "vp\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sseadd") + (set_attr "prefix" "vex") + (set_attr "mode" "")]) + (define_insn "*3_finite" [(set (match_operand:SSEMODEF2P 0 "register_operand" "=x") (smaxmin:SSEMODEF2P @@ -560,6 +971,17 @@ [(set_attr "type" "sseadd") (set_attr "mode" "")]) +(define_insn "*avx_3" + [(set (match_operand:AVXMODEF2P 0 "register_operand" "=x") + (smaxmin:AVXMODEF2P + (match_operand:AVXMODEF2P 1 "nonimmediate_operand" "%x") + (match_operand:AVXMODEF2P 2 "nonimmediate_operand" "xm")))] + "AVX_VEC_FLOAT_MODE_P (mode)" + "vp\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sseadd") + (set_attr "prefix" "vex") + (set_attr "mode" "")]) + (define_insn "*3" [(set (match_operand:SSEMODEF2P 0 "register_operand" "=x") (smaxmin:SSEMODEF2P @@ -570,6 +992,20 @@ [(set_attr "type" "sseadd") (set_attr "mode" "")]) +(define_insn "*avx_vm3" + [(set (match_operand:SSEMODEF2P 0 "register_operand" "=x") + (vec_merge:SSEMODEF2P + (smaxmin:SSEMODEF2P + (match_operand:SSEMODEF2P 1 "register_operand" "x") + (match_operand:SSEMODEF2P 2 "nonimmediate_operand" "xm")) + (match_dup 1) + (const_int 1)))] + "AVX128_VEC_FLOAT_MODE_P (mode)" + "vs\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sse") + (set_attr "prefix" "vex") + (set_attr "mode" "")]) + (define_insn "_vm3" [(set (match_operand:SSEMODEF2P 0 "register_operand" "=x") (vec_merge:SSEMODEF2P @@ -589,6 +1025,30 @@ ;; Their operands are not commutative, and thus they may be used in the ;; presence of -0.0 and NaN. 
+(define_insn "*avx_ieee_smin3" + [(set (match_operand:AVXMODEF2P 0 "register_operand" "=x") + (unspec:AVXMODEF2P + [(match_operand:AVXMODEF2P 1 "register_operand" "x") + (match_operand:AVXMODEF2P 2 "nonimmediate_operand" "xm")] + UNSPEC_IEEE_MIN))] + "AVX_VEC_FLOAT_MODE_P (mode)" + "vminp\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sseadd") + (set_attr "prefix" "vex") + (set_attr "mode" "")]) + +(define_insn "*avx_ieee_smax3" + [(set (match_operand:AVXMODEF2P 0 "register_operand" "=x") + (unspec:AVXMODEF2P + [(match_operand:AVXMODEF2P 1 "register_operand" "x") + (match_operand:AVXMODEF2P 2 "nonimmediate_operand" "xm")] + UNSPEC_IEEE_MAX))] + "AVX_VEC_FLOAT_MODE_P (mode)" + "vmaxp\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sseadd") + (set_attr "prefix" "vex") + (set_attr "mode" "")]) + (define_insn "*ieee_smin3" [(set (match_operand:SSEMODEF2P 0 "register_operand" "=x") (unspec:SSEMODEF2P @@ -611,6 +1071,48 @@ [(set_attr "type" "sseadd") (set_attr "mode" "")]) +(define_insn "avx_addsubv8sf3" + [(set (match_operand:V8SF 0 "register_operand" "=x") + (vec_merge:V8SF + (plus:V8SF + (match_operand:V8SF 1 "register_operand" "x") + (match_operand:V8SF 2 "nonimmediate_operand" "xm")) + (minus:V8SF (match_dup 1) (match_dup 2)) + (const_int 85)))] + "TARGET_AVX" + "vaddsubps\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sseadd") + (set_attr "prefix" "vex") + (set_attr "mode" "V8SF")]) + +(define_insn "avx_addsubv4df3" + [(set (match_operand:V4DF 0 "register_operand" "=x") + (vec_merge:V4DF + (plus:V4DF + (match_operand:V4DF 1 "register_operand" "x") + (match_operand:V4DF 2 "nonimmediate_operand" "xm")) + (minus:V4DF (match_dup 1) (match_dup 2)) + (const_int 5)))] + "TARGET_AVX" + "vaddsubpd\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sseadd") + (set_attr "prefix" "vex") + (set_attr "mode" "V4DF")]) + +(define_insn "*avx_addsubv4sf3" + [(set (match_operand:V4SF 0 "register_operand" "=x") + (vec_merge:V4SF + (plus:V4SF + (match_operand:V4SF 1 "register_operand" "x") + (match_operand:V4SF 2 "nonimmediate_operand" "xm")) + (minus:V4SF (match_dup 1) (match_dup 2)) + (const_int 5)))] + "TARGET_AVX" + "vaddsubps\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sseadd") + (set_attr "prefix" "vex") + (set_attr "mode" "V4SF")]) + (define_insn "sse3_addsubv4sf3" [(set (match_operand:V4SF 0 "register_operand" "=x") (vec_merge:V4SF @@ -625,6 +1127,20 @@ (set_attr "prefix_rep" "1") (set_attr "mode" "V4SF")]) +(define_insn "*avx_addsubv2df3" + [(set (match_operand:V2DF 0 "register_operand" "=x") + (vec_merge:V2DF + (plus:V2DF + (match_operand:V2DF 1 "register_operand" "x") + (match_operand:V2DF 2 "nonimmediate_operand" "xm")) + (minus:V2DF (match_dup 1) (match_dup 2)) + (const_int 1)))] + "TARGET_AVX" + "vaddsubpd\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sseadd") + (set_attr "prefix" "vex") + (set_attr "mode" "V2DF")]) + (define_insn "sse3_addsubv2df3" [(set (match_operand:V2DF 0 "register_operand" "=x") (vec_merge:V2DF @@ -638,6 +1154,103 @@ [(set_attr "type" "sseadd") (set_attr "mode" "V2DF")]) +(define_insn "avx_hv4df3" + [(set (match_operand:V4DF 0 "register_operand" "=x") + (vec_concat:V4DF + (vec_concat:V2DF + (plusminus:DF + (vec_select:DF + (match_operand:V4DF 1 "register_operand" "x") + (parallel [(const_int 0)])) + (vec_select:DF (match_dup 1) (parallel [(const_int 1)]))) + (plusminus:DF + (vec_select:DF (match_dup 1) (parallel [(const_int 2)])) + (vec_select:DF (match_dup 1) (parallel [(const_int 3)])))) + (vec_concat:V2DF + (plusminus:DF + (vec_select:DF + (match_operand:V4DF 2 
"nonimmediate_operand" "xm") + (parallel [(const_int 0)])) + (vec_select:DF (match_dup 2) (parallel [(const_int 1)]))) + (plusminus:DF + (vec_select:DF (match_dup 2) (parallel [(const_int 2)])) + (vec_select:DF (match_dup 2) (parallel [(const_int 3)]))))))] + "TARGET_AVX" + "vhpd\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sseadd") + (set_attr "prefix" "vex") + (set_attr "mode" "V4DF")]) + +(define_insn "avx_hv8sf3" + [(set (match_operand:V8SF 0 "register_operand" "=x") + (vec_concat:V8SF + (vec_concat:V4SF + (vec_concat:V2SF + (plusminus:SF + (vec_select:SF + (match_operand:V8SF 1 "register_operand" "x") + (parallel [(const_int 0)])) + (vec_select:SF (match_dup 1) (parallel [(const_int 1)]))) + (plusminus:SF + (vec_select:SF (match_dup 1) (parallel [(const_int 2)])) + (vec_select:SF (match_dup 1) (parallel [(const_int 3)])))) + (vec_concat:V2SF + (plusminus:SF + (vec_select:SF + (match_operand:V8SF 2 "nonimmediate_operand" "xm") + (parallel [(const_int 0)])) + (vec_select:SF (match_dup 2) (parallel [(const_int 1)]))) + (plusminus:SF + (vec_select:SF (match_dup 2) (parallel [(const_int 2)])) + (vec_select:SF (match_dup 2) (parallel [(const_int 3)]))))) + (vec_concat:V4SF + (vec_concat:V2SF + (plusminus:SF + (vec_select:SF (match_dup 1) (parallel [(const_int 4)])) + (vec_select:SF (match_dup 1) (parallel [(const_int 5)]))) + (plusminus:SF + (vec_select:SF (match_dup 1) (parallel [(const_int 6)])) + (vec_select:SF (match_dup 1) (parallel [(const_int 7)])))) + (vec_concat:V2SF + (plusminus:SF + (vec_select:SF (match_dup 2) (parallel [(const_int 4)])) + (vec_select:SF (match_dup 2) (parallel [(const_int 5)]))) + (plusminus:SF + (vec_select:SF (match_dup 2) (parallel [(const_int 6)])) + (vec_select:SF (match_dup 2) (parallel [(const_int 7)])))))))] + "TARGET_AVX" + "vhps\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sseadd") + (set_attr "prefix" "vex") + (set_attr "mode" "V8SF")]) + +(define_insn "*avx_hv4sf3" + [(set (match_operand:V4SF 0 "register_operand" "=x") + (vec_concat:V4SF + (vec_concat:V2SF + (plusminus:SF + (vec_select:SF + (match_operand:V4SF 1 "register_operand" "x") + (parallel [(const_int 0)])) + (vec_select:SF (match_dup 1) (parallel [(const_int 1)]))) + (plusminus:SF + (vec_select:SF (match_dup 1) (parallel [(const_int 2)])) + (vec_select:SF (match_dup 1) (parallel [(const_int 3)])))) + (vec_concat:V2SF + (plusminus:SF + (vec_select:SF + (match_operand:V4SF 2 "nonimmediate_operand" "xm") + (parallel [(const_int 0)])) + (vec_select:SF (match_dup 2) (parallel [(const_int 1)]))) + (plusminus:SF + (vec_select:SF (match_dup 2) (parallel [(const_int 2)])) + (vec_select:SF (match_dup 2) (parallel [(const_int 3)]))))))] + "TARGET_AVX" + "vhps\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sseadd") + (set_attr "prefix" "vex") + (set_attr "mode" "V4SF")]) + (define_insn "sse3_hv4sf3" [(set (match_operand:V4SF 0 "register_operand" "=x") (vec_concat:V4SF @@ -665,6 +1278,25 @@ (set_attr "prefix_rep" "1") (set_attr "mode" "V4SF")]) +(define_insn "*avx_hv2df3" + [(set (match_operand:V2DF 0 "register_operand" "=x") + (vec_concat:V2DF + (plusminus:DF + (vec_select:DF + (match_operand:V2DF 1 "register_operand" "x") + (parallel [(const_int 0)])) + (vec_select:DF (match_dup 1) (parallel [(const_int 1)]))) + (plusminus:DF + (vec_select:DF + (match_operand:V2DF 2 "nonimmediate_operand" "xm") + (parallel [(const_int 0)])) + (vec_select:DF (match_dup 2) (parallel [(const_int 1)])))))] + "TARGET_AVX" + "vhpd\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sseadd") + (set_attr "prefix" "vex") + 
(set_attr "mode" "V2DF")]) + (define_insn "sse3_hv2df3" [(set (match_operand:V2DF 0 "register_operand" "=x") (vec_concat:V2DF @@ -732,7 +1364,49 @@ ;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -(define_insn "_maskcmp3" +(define_insn "avx_cmpp3" + [(set (match_operand:AVXMODEF2P 0 "register_operand" "=x") + (unspec:AVXMODEF2P + [(match_operand:AVXMODEF2P 1 "register_operand" "x") + (match_operand:AVXMODEF2P 2 "nonimmediate_operand" "xm") + (match_operand:SI 3 "const_0_to_31_operand" "n")] + UNSPEC_PCMP))] + "TARGET_AVX" + "vcmpp\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "type" "ssecmp") + (set_attr "prefix" "vex") + (set_attr "mode" "")]) + +(define_insn "avx_cmps3" + [(set (match_operand:SSEMODEF2P 0 "register_operand" "") + (vec_merge:SSEMODEF2P + (unspec:SSEMODEF2P + [(match_operand:SSEMODEF2P 1 "register_operand" "x") + (match_operand:SSEMODEF2P 2 "nonimmediate_operand" "xm") + (match_operand:SI 3 "const_0_to_31_operand" "n")] + UNSPEC_PCMP) + (match_dup 1) + (const_int 1)))] + "TARGET_AVX" + "vcmps\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "type" "ssecmp") + (set_attr "prefix" "vex") + (set_attr "mode" "")]) + +;; We don't promote 128bit vector compare intrinsics. But vectorizer +;; may generate 256bit vector compare instructions. +(define_insn "*avx_maskcmp3" + [(set (match_operand:AVXMODEF2P 0 "register_operand" "=x") + (match_operator:AVXMODEF2P 3 "avx_comparison_float_operator" + [(match_operand:AVXMODEF2P 1 "register_operand" "x") + (match_operand:AVXMODEF2P 2 "nonimmediate_operand" "xm")]))] + "AVX_VEC_FLOAT_MODE_P (mode)" + "vcmp%D3p\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "ssecmp") + (set_attr "prefix" "vex") + (set_attr "mode" "")]) + +(define_insn "_maskcmp3" [(set (match_operand:SSEMODEF4 0 "register_operand" "=x") (match_operator:SSEMODEF4 3 "sse_comparison_operator" [(match_operand:SSEMODEF4 1 "register_operand" "0") @@ -766,8 +1440,9 @@ (match_operand: 1 "nonimmediate_operand" "xm") (parallel [(const_int 0)]))))] "SSE_FLOAT_MODE_P (mode)" - "comis\t{%1, %0|%0, %1}" + "%vcomis\t{%1, %0|%0, %1}" [(set_attr "type" "ssecomi") + (set_attr "prefix" "maybe_vex") (set_attr "mode" "")]) (define_insn "_ucomi" @@ -780,8 +1455,9 @@ (match_operand: 1 "nonimmediate_operand" "xm") (parallel [(const_int 0)]))))] "SSE_FLOAT_MODE_P (mode)" - "ucomis\t{%1, %0|%0, %1}" + "%vucomis\t{%1, %0|%0, %1}" [(set_attr "type" "ssecomi") + (set_attr "prefix" "maybe_vex") (set_attr "mode" "")]) (define_expand "vcond" @@ -806,6 +1482,18 @@ ;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +(define_insn "avx_nand3" + [(set (match_operand:AVXMODEF2P 0 "register_operand" "=x") + (and:AVXMODEF2P + (not:AVXMODEF2P + (match_operand:AVXMODEF2P 1 "register_operand" "x")) + (match_operand:AVXMODEF2P 2 "nonimmediate_operand" "xm")))] + "AVX_VEC_FLOAT_MODE_P (mode)" + "vandnp\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sselog") + (set_attr "prefix" "vex") + (set_attr "mode" "")]) + (define_insn "_nand3" [(set (match_operand:SSEMODEF2P 0 "register_operand" "=x") (and:SSEMODEF2P @@ -818,6 +1506,26 @@ (set_attr "mode" "")]) (define_expand "3" + [(set (match_operand:AVX256MODEF2P 0 "register_operand" "") + (plogic:AVX256MODEF2P + (match_operand:AVX256MODEF2P 1 "nonimmediate_operand" "") + (match_operand:AVX256MODEF2P 2 "nonimmediate_operand" "")))] + "AVX256_VEC_FLOAT_MODE_P (mode)" + "ix86_fixup_binary_operands_no_copy (, mode, operands);") + +(define_insn "*avx_3" + [(set (match_operand:AVXMODEF2P 0 "register_operand" "=x") + (plogic:AVXMODEF2P + 
(match_operand:AVXMODEF2P 1 "nonimmediate_operand" "%x") + (match_operand:AVXMODEF2P 2 "nonimmediate_operand" "xm")))] + "AVX_VEC_FLOAT_MODE_P (mode) + && ix86_binary_operator_ok (, mode, operands)" + "vp\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sselog") + (set_attr "prefix" "vex") + (set_attr "mode" "")]) + +(define_expand "3" [(set (match_operand:SSEMODEF2P 0 "register_operand" "") (plogic:SSEMODEF2P (match_operand:SSEMODEF2P 1 "nonimmediate_operand" "") @@ -841,6 +1549,18 @@ ;; allocation lossage. These patterns do not allow memory operands ;; because the native instructions read the full 128-bits. +(define_insn "*avx_nand3" + [(set (match_operand:MODEF 0 "register_operand" "=x") + (and:MODEF + (not:MODEF + (match_operand:MODEF 1 "register_operand" "x")) + (match_operand:MODEF 2 "register_operand" "x")))] + "AVX_FLOAT_MODE_P (mode)" + "vandnp\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sselog") + (set_attr "prefix" "vex") + (set_attr "mode" "")]) + (define_insn "*nand3" [(set (match_operand:MODEF 0 "register_operand" "=x") (and:MODEF @@ -852,6 +1572,17 @@ [(set_attr "type" "sselog") (set_attr "mode" "")]) +(define_insn "*avx_3" + [(set (match_operand:MODEF 0 "register_operand" "=x") + (plogic:MODEF + (match_operand:MODEF 1 "register_operand" "x") + (match_operand:MODEF 2 "register_operand" "x")))] + "AVX_FLOAT_MODE_P (mode)" + "vp\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sselog") + (set_attr "prefix" "vex") + (set_attr "mode" "")]) + (define_insn "*3" [(set (match_operand:MODEF 0 "register_operand" "=x") (plogic:MODEF @@ -1460,6 +2191,19 @@ (set_attr "unit" "mmx") (set_attr "mode" "SF")]) +(define_insn "*avx_cvtsi2ss" + [(set (match_operand:V4SF 0 "register_operand" "=x") + (vec_merge:V4SF + (vec_duplicate:V4SF + (float:SF (match_operand:SI 2 "nonimmediate_operand" "rm"))) + (match_operand:V4SF 1 "register_operand" "x") + (const_int 1)))] + "TARGET_AVX" + "vcvtsi2ss\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sseicvt") + (set_attr "prefix" "vex") + (set_attr "mode" "SF")]) + (define_insn "sse_cvtsi2ss" [(set (match_operand:V4SF 0 "register_operand" "=x,x") (vec_merge:V4SF @@ -1474,6 +2218,19 @@ (set_attr "amdfam10_decode" "vector,double") (set_attr "mode" "SF")]) +(define_insn "*avx_cvtsi2ssq" + [(set (match_operand:V4SF 0 "register_operand" "=x") + (vec_merge:V4SF + (vec_duplicate:V4SF + (float:SF (match_operand:DI 2 "nonimmediate_operand" "rm"))) + (match_operand:V4SF 1 "register_operand" "x") + (const_int 1)))] + "TARGET_AVX && TARGET_64BIT" + "vcvtsi2ssq\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sseicvt") + (set_attr "prefix" "vex") + (set_attr "mode" "SF")]) + (define_insn "sse_cvtsi2ssq" [(set (match_operand:V4SF 0 "register_operand" "=x,x") (vec_merge:V4SF @@ -1496,10 +2253,11 @@ (parallel [(const_int 0)]))] UNSPEC_FIX_NOTRUNC))] "TARGET_SSE" - "cvtss2si\t{%1, %0|%0, %1}" + "%vcvtss2si\t{%1, %0|%0, %1}" [(set_attr "type" "sseicvt") (set_attr "athlon_decode" "double,vector") (set_attr "prefix_rep" "1") + (set_attr "prefix" "maybe_vex") (set_attr "mode" "SI")]) (define_insn "sse_cvtss2si_2" @@ -1507,11 +2265,12 @@ (unspec:SI [(match_operand:SF 1 "nonimmediate_operand" "x,m")] UNSPEC_FIX_NOTRUNC))] "TARGET_SSE" - "cvtss2si\t{%1, %0|%0, %1}" + "%vcvtss2si\t{%1, %0|%0, %1}" [(set_attr "type" "sseicvt") (set_attr "athlon_decode" "double,vector") (set_attr "amdfam10_decode" "double,double") (set_attr "prefix_rep" "1") + (set_attr "prefix" "maybe_vex") (set_attr "mode" "SI")]) (define_insn "sse_cvtss2siq" @@ -1522,10 +2281,11 @@ (parallel [(const_int 0)]))] 
UNSPEC_FIX_NOTRUNC))] "TARGET_SSE && TARGET_64BIT" - "cvtss2siq\t{%1, %0|%0, %1}" + "%vcvtss2siq\t{%1, %0|%0, %1}" [(set_attr "type" "sseicvt") (set_attr "athlon_decode" "double,vector") (set_attr "prefix_rep" "1") + (set_attr "prefix" "maybe_vex") (set_attr "mode" "DI")]) (define_insn "sse_cvtss2siq_2" @@ -1533,11 +2293,12 @@ (unspec:DI [(match_operand:SF 1 "nonimmediate_operand" "x,m")] UNSPEC_FIX_NOTRUNC))] "TARGET_SSE && TARGET_64BIT" - "cvtss2siq\t{%1, %0|%0, %1}" + "%vcvtss2siq\t{%1, %0|%0, %1}" [(set_attr "type" "sseicvt") (set_attr "athlon_decode" "double,vector") (set_attr "amdfam10_decode" "double,double") (set_attr "prefix_rep" "1") + (set_attr "prefix" "maybe_vex") (set_attr "mode" "DI")]) (define_insn "sse_cvttss2si" @@ -1547,11 +2308,12 @@ (match_operand:V4SF 1 "nonimmediate_operand" "x,m") (parallel [(const_int 0)]))))] "TARGET_SSE" - "cvttss2si\t{%1, %0|%0, %1}" + "%vcvttss2si\t{%1, %0|%0, %1}" [(set_attr "type" "sseicvt") (set_attr "athlon_decode" "double,vector") (set_attr "amdfam10_decode" "double,double") (set_attr "prefix_rep" "1") + (set_attr "prefix" "maybe_vex") (set_attr "mode" "SI")]) (define_insn "sse_cvttss2siq" @@ -1561,13 +2323,24 @@ (match_operand:V4SF 1 "nonimmediate_operand" "x,m") (parallel [(const_int 0)]))))] "TARGET_SSE && TARGET_64BIT" - "cvttss2siq\t{%1, %0|%0, %1}" + "%vcvttss2siq\t{%1, %0|%0, %1}" [(set_attr "type" "sseicvt") (set_attr "athlon_decode" "double,vector") (set_attr "amdfam10_decode" "double,double") (set_attr "prefix_rep" "1") + (set_attr "prefix" "maybe_vex") (set_attr "mode" "DI")]) +(define_insn "avx_cvtdq2ps" + [(set (match_operand:AVXMODEDCVTDQ2PS 0 "register_operand" "=x") + (float:AVXMODEDCVTDQ2PS + (match_operand: 1 "nonimmediate_operand" "xm")))] + "TARGET_AVX" + "vcvtdq2ps\t{%1, %0|%0, %1}" + [(set_attr "type" "ssecvt") + (set_attr "prefix" "vex") + (set_attr "mode" "")]) + (define_insn "sse2_cvtdq2ps" [(set (match_operand:V4SF 0 "register_operand" "=x") (float:V4SF (match_operand:V4SI 1 "nonimmediate_operand" "xm")))] @@ -1576,6 +2349,17 @@ [(set_attr "type" "ssecvt") (set_attr "mode" "V4SF")]) +(define_insn "avx_cvtps2dq" + [(set (match_operand:AVXMODEDCVTPS2DQ 0 "register_operand" "=x") + (unspec:AVXMODEDCVTPS2DQ + [(match_operand: 1 "nonimmediate_operand" "xm")] + UNSPEC_FIX_NOTRUNC))] + "TARGET_AVX" + "vcvtps2dq\t{%1, %0|%0, %1}" + [(set_attr "type" "ssecvt") + (set_attr "prefix" "vex") + (set_attr "mode" "")]) + (define_insn "sse2_cvtps2dq" [(set (match_operand:V4SI 0 "register_operand" "=x") (unspec:V4SI [(match_operand:V4SF 1 "nonimmediate_operand" "xm")] @@ -1586,6 +2370,16 @@ (set_attr "prefix_data16" "1") (set_attr "mode" "TI")]) +(define_insn "avx_cvttps2dq" + [(set (match_operand:AVXMODEDCVTPS2DQ 0 "register_operand" "=x") + (fix:AVXMODEDCVTPS2DQ + (match_operand: 1 "nonimmediate_operand" "xm")))] + "TARGET_AVX" + "vcvttps2dq\t{%1, %0|%0, %1}" + [(set_attr "type" "ssecvt") + (set_attr "prefix" "vex") + (set_attr "mode" "")]) + (define_insn "sse2_cvttps2dq" [(set (match_operand:V4SI 0 "register_operand" "=x") (fix:V4SI (match_operand:V4SF 1 "nonimmediate_operand" "xm")))] @@ -1631,6 +2425,19 @@ (set_attr "prefix_data16" "1") (set_attr "mode" "TI")]) +(define_insn "*avx_cvtsi2sd" + [(set (match_operand:V2DF 0 "register_operand" "=x") + (vec_merge:V2DF + (vec_duplicate:V2DF + (float:DF (match_operand:SI 2 "nonimmediate_operand" "rm"))) + (match_operand:V2DF 1 "register_operand" "x") + (const_int 1)))] + "TARGET_AVX" + "vcvtsi2sd\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sseicvt") + (set_attr "prefix" "vex") + 
(set_attr "mode" "DF")]) + (define_insn "sse2_cvtsi2sd" [(set (match_operand:V2DF 0 "register_operand" "=x,x") (vec_merge:V2DF @@ -1645,6 +2452,19 @@ (set_attr "athlon_decode" "double,direct") (set_attr "amdfam10_decode" "vector,double")]) +(define_insn "*avx_cvtsi2sdq" + [(set (match_operand:V2DF 0 "register_operand" "=x") + (vec_merge:V2DF + (vec_duplicate:V2DF + (float:DF (match_operand:DI 2 "nonimmediate_operand" "rm"))) + (match_operand:V2DF 1 "register_operand" "x") + (const_int 1)))] + "TARGET_AVX && TARGET_64BIT" + "vcvtsi2sdq\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sseicvt") + (set_attr "prefix" "vex") + (set_attr "mode" "DF")]) + (define_insn "sse2_cvtsi2sdq" [(set (match_operand:V2DF 0 "register_operand" "=x,x") (vec_merge:V2DF @@ -1667,10 +2487,11 @@ (parallel [(const_int 0)]))] UNSPEC_FIX_NOTRUNC))] "TARGET_SSE2" - "cvtsd2si\t{%1, %0|%0, %1}" + "%vcvtsd2si\t{%1, %0|%0, %1}" [(set_attr "type" "sseicvt") (set_attr "athlon_decode" "double,vector") (set_attr "prefix_rep" "1") + (set_attr "prefix" "maybe_vex") (set_attr "mode" "SI")]) (define_insn "sse2_cvtsd2si_2" @@ -1678,11 +2499,12 @@ (unspec:SI [(match_operand:DF 1 "nonimmediate_operand" "x,m")] UNSPEC_FIX_NOTRUNC))] "TARGET_SSE2" - "cvtsd2si\t{%1, %0|%0, %1}" + "%vcvtsd2si\t{%1, %0|%0, %1}" [(set_attr "type" "sseicvt") (set_attr "athlon_decode" "double,vector") (set_attr "amdfam10_decode" "double,double") (set_attr "prefix_rep" "1") + (set_attr "prefix" "maybe_vex") (set_attr "mode" "SI")]) (define_insn "sse2_cvtsd2siq" @@ -1693,10 +2515,11 @@ (parallel [(const_int 0)]))] UNSPEC_FIX_NOTRUNC))] "TARGET_SSE2 && TARGET_64BIT" - "cvtsd2siq\t{%1, %0|%0, %1}" + "%vcvtsd2siq\t{%1, %0|%0, %1}" [(set_attr "type" "sseicvt") (set_attr "athlon_decode" "double,vector") (set_attr "prefix_rep" "1") + (set_attr "prefix" "maybe_vex") (set_attr "mode" "DI")]) (define_insn "sse2_cvtsd2siq_2" @@ -1704,11 +2527,12 @@ (unspec:DI [(match_operand:DF 1 "nonimmediate_operand" "x,m")] UNSPEC_FIX_NOTRUNC))] "TARGET_SSE2 && TARGET_64BIT" - "cvtsd2siq\t{%1, %0|%0, %1}" + "%vcvtsd2siq\t{%1, %0|%0, %1}" [(set_attr "type" "sseicvt") (set_attr "athlon_decode" "double,vector") (set_attr "amdfam10_decode" "double,double") (set_attr "prefix_rep" "1") + (set_attr "prefix" "maybe_vex") (set_attr "mode" "DI")]) (define_insn "sse2_cvttsd2si" @@ -1718,9 +2542,10 @@ (match_operand:V2DF 1 "nonimmediate_operand" "x,m") (parallel [(const_int 0)]))))] "TARGET_SSE2" - "cvttsd2si\t{%1, %0|%0, %1}" + "%vcvttsd2si\t{%1, %0|%0, %1}" [(set_attr "type" "sseicvt") (set_attr "prefix_rep" "1") + (set_attr "prefix" "maybe_vex") (set_attr "mode" "SI") (set_attr "athlon_decode" "double,vector") (set_attr "amdfam10_decode" "double,double")]) @@ -1732,13 +2557,23 @@ (match_operand:V2DF 1 "nonimmediate_operand" "x,m") (parallel [(const_int 0)]))))] "TARGET_SSE2 && TARGET_64BIT" - "cvttsd2siq\t{%1, %0|%0, %1}" + "%vcvttsd2siq\t{%1, %0|%0, %1}" [(set_attr "type" "sseicvt") (set_attr "prefix_rep" "1") + (set_attr "prefix" "maybe_vex") (set_attr "mode" "DI") (set_attr "athlon_decode" "double,vector") (set_attr "amdfam10_decode" "double,double")]) +(define_insn "avx_cvtdq2pd256" + [(set (match_operand:V4DF 0 "register_operand" "=x") + (float:V4DF (match_operand:V4SI 1 "nonimmediate_operand" "xm")))] + "TARGET_AVX" + "vcvtdq2pd\t{%1, %0|%0, %1}" + [(set_attr "type" "ssecvt") + (set_attr "prefix" "vex") + (set_attr "mode" "V4DF")]) + (define_insn "sse2_cvtdq2pd" [(set (match_operand:V2DF 0 "register_operand" "=x") (float:V2DF @@ -1746,10 +2581,21 @@ (match_operand:V4SI 1 
"nonimmediate_operand" "xm") (parallel [(const_int 0) (const_int 1)]))))] "TARGET_SSE2" - "cvtdq2pd\t{%1, %0|%0, %1}" + "%vcvtdq2pd\t{%1, %0|%0, %1}" [(set_attr "type" "ssecvt") + (set_attr "prefix" "maybe_vex") (set_attr "mode" "V2DF")]) +(define_insn "avx_cvtpd2dq256" + [(set (match_operand:V4SI 0 "register_operand" "=x") + (unspec:V4SI [(match_operand:V4DF 1 "nonimmediate_operand" "xm")] + UNSPEC_FIX_NOTRUNC))] + "TARGET_AVX" + "vcvtpd2dq{y}\t{%1, %0|%0, %1}" + [(set_attr "type" "ssecvt") + (set_attr "prefix" "vex") + (set_attr "mode" "OI")]) + (define_expand "sse2_cvtpd2dq" [(set (match_operand:V4SI 0 "register_operand" "") (vec_concat:V4SI @@ -1766,12 +2612,23 @@ UNSPEC_FIX_NOTRUNC) (match_operand:V2SI 2 "const0_operand" "")))] "TARGET_SSE2" - "cvtpd2dq\t{%1, %0|%0, %1}" + "* return TARGET_AVX ? \"vcvtpd2dq{x}\t{%1, %0|%0, %1}\" + : \"cvtpd2dq\t{%1, %0|%0, %1}\";" [(set_attr "type" "ssecvt") (set_attr "prefix_rep" "1") + (set_attr "prefix" "maybe_vex") (set_attr "mode" "TI") (set_attr "amdfam10_decode" "double")]) +(define_insn "avx_cvttpd2dq256" + [(set (match_operand:V4SI 0 "register_operand" "=x") + (fix:V4SI (match_operand:V4DF 1 "nonimmediate_operand" "xm")))] + "TARGET_AVX" + "vcvttpd2dq{y}\t{%1, %0|%0, %1}" + [(set_attr "type" "ssecvt") + (set_attr "prefix" "vex") + (set_attr "mode" "OI")]) + (define_expand "sse2_cvttpd2dq" [(set (match_operand:V4SI 0 "register_operand" "") (vec_concat:V4SI @@ -1786,12 +2643,28 @@ (fix:V2SI (match_operand:V2DF 1 "nonimmediate_operand" "xm")) (match_operand:V2SI 2 "const0_operand" "")))] "TARGET_SSE2" - "cvttpd2dq\t{%1, %0|%0, %1}" + "* return TARGET_AVX ? \"vcvttpd2dq{x}\t{%1, %0|%0, %1}\" + : \"cvttpd2dq\t{%1, %0|%0, %1}\";" [(set_attr "type" "ssecvt") (set_attr "prefix_rep" "1") + (set_attr "prefix" "maybe_vex") (set_attr "mode" "TI") (set_attr "amdfam10_decode" "double")]) +(define_insn "*avx_cvtsd2ss" + [(set (match_operand:V4SF 0 "register_operand" "=x") + (vec_merge:V4SF + (vec_duplicate:V4SF + (float_truncate:V2SF + (match_operand:V2DF 2 "nonimmediate_operand" "xm"))) + (match_operand:V4SF 1 "register_operand" "x") + (const_int 1)))] + "TARGET_AVX" + "vcvtsd2ss\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "ssecvt") + (set_attr "prefix" "vex") + (set_attr "mode" "SF")]) + (define_insn "sse2_cvtsd2ss" [(set (match_operand:V4SF 0 "register_operand" "=x,x") (vec_merge:V4SF @@ -1807,6 +2680,21 @@ (set_attr "amdfam10_decode" "vector,double") (set_attr "mode" "SF")]) +(define_insn "*avx_cvtss2sd" + [(set (match_operand:V2DF 0 "register_operand" "=x") + (vec_merge:V2DF + (float_extend:V2DF + (vec_select:V2SF + (match_operand:V4SF 2 "nonimmediate_operand" "xm") + (parallel [(const_int 0) (const_int 1)]))) + (match_operand:V2DF 1 "register_operand" "x") + (const_int 1)))] + "TARGET_AVX" + "vcvtss2sd\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "ssecvt") + (set_attr "prefix" "vex") + (set_attr "mode" "DF")]) + (define_insn "sse2_cvtss2sd" [(set (match_operand:V2DF 0 "register_operand" "=x,x") (vec_merge:V2DF @@ -1822,6 +2710,16 @@ (set_attr "amdfam10_decode" "vector,double") (set_attr "mode" "DF")]) +(define_insn "avx_cvtpd2ps256" + [(set (match_operand:V4SF 0 "register_operand" "=x") + (float_truncate:V4SF + (match_operand:V4DF 1 "nonimmediate_operand" "xm")))] + "TARGET_AVX" + "vcvtpd2ps{y}\t{%1, %0|%0, %1}" + [(set_attr "type" "ssecvt") + (set_attr "prefix" "vex") + (set_attr "mode" "V4SF")]) + (define_expand "sse2_cvtpd2ps" [(set (match_operand:V4SF 0 "register_operand" "") (vec_concat:V4SF @@ -1838,12 +2736,24 @@ (match_operand:V2DF 1 
"nonimmediate_operand" "xm")) (match_operand:V2SF 2 "const0_operand" "")))] "TARGET_SSE2" - "cvtpd2ps\t{%1, %0|%0, %1}" + "* return TARGET_AVX ? \"vcvtpd2ps{x}\t{%1, %0|%0, %1}\" + : \"cvtpd2ps\t{%1, %0|%0, %1}\";" [(set_attr "type" "ssecvt") (set_attr "prefix_data16" "1") + (set_attr "prefix" "maybe_vex") (set_attr "mode" "V4SF") (set_attr "amdfam10_decode" "double")]) +(define_insn "avx_cvtps2pd256" + [(set (match_operand:V4DF 0 "register_operand" "=x") + (float_extend:V4DF + (match_operand:V4SF 1 "nonimmediate_operand" "xm")))] + "TARGET_AVX" + "vcvtps2pd\t{%1, %0|%0, %1}" + [(set_attr "type" "ssecvt") + (set_attr "prefix" "vex") + (set_attr "mode" "V4DF")]) + (define_insn "sse2_cvtps2pd" [(set (match_operand:V2DF 0 "register_operand" "=x") (float_extend:V2DF @@ -1851,8 +2761,9 @@ (match_operand:V4SF 1 "nonimmediate_operand" "xm") (parallel [(const_int 0) (const_int 1)]))))] "TARGET_SSE2" - "cvtps2pd\t{%1, %0|%0, %1}" + "%vcvtps2pd\t{%1, %0|%0, %1}" [(set_attr "type" "ssecvt") + (set_attr "prefix" "maybe_vex") (set_attr "mode" "V2DF") (set_attr "amdfam10_decode" "direct")]) @@ -2032,6 +2943,25 @@ "TARGET_SSE" "ix86_fixup_binary_operands (UNKNOWN, V4SFmode, operands);") +(define_insn "*avx_movhlps" + [(set (match_operand:V4SF 0 "nonimmediate_operand" "=x,x,m") + (vec_select:V4SF + (vec_concat:V8SF + (match_operand:V4SF 1 "nonimmediate_operand" " x,x,0") + (match_operand:V4SF 2 "nonimmediate_operand" " x,o,x")) + (parallel [(const_int 6) + (const_int 7) + (const_int 2) + (const_int 3)])))] + "TARGET_AVX && !(MEM_P (operands[1]) && MEM_P (operands[2]))" + "@ + vmovhlps\t{%2, %1, %0|%0, %1, %2} + vmovlps\t{%H2, %1, %0|%0, %1, %H2} + vmovhps\t{%2, %0|%0, %2}" + [(set_attr "type" "ssemov") + (set_attr "prefix" "vex") + (set_attr "mode" "V4SF,V2SF,V2SF")]) + (define_insn "sse_movhlps" [(set (match_operand:V4SF 0 "nonimmediate_operand" "=x,x,m") (vec_select:V4SF @@ -2063,6 +2993,25 @@ "TARGET_SSE" "ix86_fixup_binary_operands (UNKNOWN, V4SFmode, operands);") +(define_insn "*avx_movlhps" + [(set (match_operand:V4SF 0 "nonimmediate_operand" "=x,x,o") + (vec_select:V4SF + (vec_concat:V8SF + (match_operand:V4SF 1 "nonimmediate_operand" " x,x,0") + (match_operand:V4SF 2 "nonimmediate_operand" " x,m,x")) + (parallel [(const_int 0) + (const_int 1) + (const_int 4) + (const_int 5)])))] + "TARGET_AVX && ix86_binary_operator_ok (UNKNOWN, V4SFmode, operands)" + "@ + vmovlhps\t{%2, %1, %0|%0, %1, %2} + vmovhps\t{%2, %1, %0|%0, %1, %2} + vmovlps\t{%2, %H0|%H0, %2}" + [(set_attr "type" "ssemov") + (set_attr "prefix" "vex") + (set_attr "mode" "V4SF,V2SF,V2SF")]) + (define_insn "sse_movlhps" [(set (match_operand:V4SF 0 "nonimmediate_operand" "=x,x,o") (vec_select:V4SF @@ -2081,6 +3030,36 @@ [(set_attr "type" "ssemov") (set_attr "mode" "V4SF,V2SF,V2SF")]) +(define_insn "avx_unpckhps256" + [(set (match_operand:V8SF 0 "register_operand" "=x") + (vec_select:V8SF + (vec_concat:V16SF + (match_operand:V8SF 1 "register_operand" "x") + (match_operand:V8SF 2 "nonimmediate_operand" "xm")) + (parallel [(const_int 2) (const_int 6) + (const_int 3) (const_int 7) + (const_int 10) (const_int 14) + (const_int 11) (const_int 15)])))] + "TARGET_AVX" + "vunpckhps\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sselog") + (set_attr "prefix" "vex") + (set_attr "mode" "V8SF")]) + +(define_insn "*avx_unpckhps" + [(set (match_operand:V4SF 0 "register_operand" "=x") + (vec_select:V4SF + (vec_concat:V8SF + (match_operand:V4SF 1 "register_operand" "x") + (match_operand:V4SF 2 "nonimmediate_operand" "xm")) + (parallel [(const_int 2) 
(const_int 6) + (const_int 3) (const_int 7)])))] + "TARGET_AVX" + "vunpckhps\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sselog") + (set_attr "prefix" "vex") + (set_attr "mode" "V4SF")]) + (define_insn "sse_unpckhps" [(set (match_operand:V4SF 0 "register_operand" "=x") (vec_select:V4SF @@ -2094,6 +3073,36 @@ [(set_attr "type" "sselog") (set_attr "mode" "V4SF")]) +(define_insn "avx_unpcklps256" + [(set (match_operand:V8SF 0 "register_operand" "=x") + (vec_select:V8SF + (vec_concat:V16SF + (match_operand:V8SF 1 "register_operand" "x") + (match_operand:V8SF 2 "nonimmediate_operand" "xm")) + (parallel [(const_int 0) (const_int 4) + (const_int 1) (const_int 5) + (const_int 8) (const_int 12) + (const_int 9) (const_int 13)])))] + "TARGET_AVX" + "vunpcklps\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sselog") + (set_attr "prefix" "vex") + (set_attr "mode" "V8SF")]) + +(define_insn "*avx_unpcklps" + [(set (match_operand:V4SF 0 "register_operand" "=x") + (vec_select:V4SF + (vec_concat:V8SF + (match_operand:V4SF 1 "register_operand" "x") + (match_operand:V4SF 2 "nonimmediate_operand" "xm")) + (parallel [(const_int 0) (const_int 4) + (const_int 1) (const_int 5)])))] + "TARGET_AVX" + "vunpcklps\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sselog") + (set_attr "prefix" "vex") + (set_attr "mode" "V4SF")]) + (define_insn "sse_unpcklps" [(set (match_operand:V4SF 0 "register_operand" "=x") (vec_select:V4SF @@ -2109,6 +3118,22 @@ ;; These are modeled with the same vec_concat as the others so that we ;; capture users of shufps that can use the new instructions +(define_insn "avx_movshdup256" + [(set (match_operand:V8SF 0 "register_operand" "=x") + (vec_select:V8SF + (vec_concat:V16SF + (match_operand:V8SF 1 "nonimmediate_operand" "xm") + (match_dup 1)) + (parallel [(const_int 1) (const_int 1) + (const_int 3) (const_int 3) + (const_int 5) (const_int 5) + (const_int 7) (const_int 7)])))] + "TARGET_AVX" + "vmovshdup\t{%1, %0|%0, %1}" + [(set_attr "type" "sse") + (set_attr "prefix" "vex") + (set_attr "mode" "V8SF")]) + (define_insn "sse3_movshdup" [(set (match_operand:V4SF 0 "register_operand" "=x") (vec_select:V4SF @@ -2120,11 +3145,28 @@ (const_int 7) (const_int 7)])))] "TARGET_SSE3" - "movshdup\t{%1, %0|%0, %1}" + "%vmovshdup\t{%1, %0|%0, %1}" [(set_attr "type" "sse") (set_attr "prefix_rep" "1") + (set_attr "prefix" "maybe_vex") (set_attr "mode" "V4SF")]) +(define_insn "avx_movsldup256" + [(set (match_operand:V8SF 0 "register_operand" "=x") + (vec_select:V8SF + (vec_concat:V16SF + (match_operand:V8SF 1 "nonimmediate_operand" "xm") + (match_dup 1)) + (parallel [(const_int 0) (const_int 0) + (const_int 2) (const_int 2) + (const_int 4) (const_int 4) + (const_int 6) (const_int 6)])))] + "TARGET_AVX" + "vmovsldup\t{%1, %0|%0, %1}" + [(set_attr "type" "sse") + (set_attr "prefix" "vex") + (set_attr "mode" "V8SF")]) + (define_insn "sse3_movsldup" [(set (match_operand:V4SF 0 "register_operand" "=x") (vec_select:V4SF @@ -2136,11 +3178,66 @@ (const_int 6) (const_int 6)])))] "TARGET_SSE3" - "movsldup\t{%1, %0|%0, %1}" + "%vmovsldup\t{%1, %0|%0, %1}" [(set_attr "type" "sse") (set_attr "prefix_rep" "1") + (set_attr "prefix" "maybe_vex") (set_attr "mode" "V4SF")]) +(define_expand "avx_shufps256" + [(match_operand:V8SF 0 "register_operand" "") + (match_operand:V8SF 1 "register_operand" "") + (match_operand:V8SF 2 "nonimmediate_operand" "") + (match_operand:SI 3 "const_int_operand" "")] + "TARGET_AVX" +{ + int mask = INTVAL (operands[3]); + emit_insn (gen_avx_shufps256_1 (operands[0], operands[1], operands[2], + GEN_INT 
((mask >> 0) & 3), + GEN_INT ((mask >> 2) & 3), + GEN_INT (((mask >> 4) & 3) + 8), + GEN_INT (((mask >> 6) & 3) + 8), + GEN_INT (((mask >> 0) & 3) + 4), + GEN_INT (((mask >> 2) & 3) + 4), + GEN_INT (((mask >> 4) & 3) + 12), + GEN_INT (((mask >> 6) & 3) + 12))); + DONE; +}) + +;; One bit in mask selects 2 elements. +(define_insn "avx_shufps256_1" + [(set (match_operand:V8SF 0 "register_operand" "=x") + (vec_select:V8SF + (vec_concat:V16SF + (match_operand:V8SF 1 "register_operand" "x") + (match_operand:V8SF 2 "nonimmediate_operand" "xm")) + (parallel [(match_operand 3 "const_0_to_3_operand" "") + (match_operand 4 "const_0_to_3_operand" "") + (match_operand 5 "const_8_to_11_operand" "") + (match_operand 6 "const_8_to_11_operand" "") + (match_operand 7 "const_4_to_7_operand" "") + (match_operand 8 "const_4_to_7_operand" "") + (match_operand 9 "const_12_to_15_operand" "") + (match_operand 10 "const_12_to_15_operand" "")])))] + "TARGET_AVX + && (INTVAL (operands[3]) == (INTVAL (operands[7]) - 4) + && INTVAL (operands[4]) == (INTVAL (operands[8]) - 4) + && INTVAL (operands[5]) == (INTVAL (operands[9]) - 4) + && INTVAL (operands[6]) == (INTVAL (operands[10]) - 4))" +{ + int mask; + mask = INTVAL (operands[3]); + mask |= INTVAL (operands[4]) << 2; + mask |= (INTVAL (operands[5]) - 8) << 4; + mask |= (INTVAL (operands[6]) - 8) << 6; + operands[3] = GEN_INT (mask); + + return "vshufps\t{%3, %2, %1, %0|%0, %1, %2, %3}"; +} + [(set_attr "type" "sselog") + (set_attr "prefix" "vex") + (set_attr "mode" "V8SF")]) + (define_expand "sse_shufps" [(match_operand:V4SF 0 "register_operand" "") (match_operand:V4SF 1 "register_operand" "") @@ -2157,6 +3254,31 @@ DONE; }) +(define_insn "*avx_shufps_" + [(set (match_operand:SSEMODE4S 0 "register_operand" "=x") + (vec_select:SSEMODE4S + (vec_concat: + (match_operand:SSEMODE4S 1 "register_operand" "x") + (match_operand:SSEMODE4S 2 "nonimmediate_operand" "xm")) + (parallel [(match_operand 3 "const_0_to_3_operand" "") + (match_operand 4 "const_0_to_3_operand" "") + (match_operand 5 "const_4_to_7_operand" "") + (match_operand 6 "const_4_to_7_operand" "")])))] + "TARGET_AVX" +{ + int mask = 0; + mask |= INTVAL (operands[3]) << 0; + mask |= INTVAL (operands[4]) << 2; + mask |= (INTVAL (operands[5]) - 4) << 4; + mask |= (INTVAL (operands[6]) - 4) << 6; + operands[3] = GEN_INT (mask); + + return "vshufps\t{%3, %2, %1, %0|%0, %1, %2, %3}"; +} + [(set_attr "type" "sselog") + (set_attr "prefix" "vex") + (set_attr "mode" "V4SF")]) + (define_insn "sse_shufps_" [(set (match_operand:SSEMODE4S 0 "register_operand" "=x") (vec_select:SSEMODE4S @@ -2188,10 +3310,11 @@ (parallel [(const_int 2) (const_int 3)])))] "TARGET_SSE" "@ - movhps\t{%1, %0|%0, %1} - movhlps\t{%1, %0|%0, %1} - movlps\t{%H1, %0|%0, %H1}" + %vmovhps\t{%1, %0|%0, %1} + %vmovhlps\t{%1, %d0|%d0, %1} + %vmovlps\t{%H1, %d0|%d0, %H1}" [(set_attr "type" "ssemov") + (set_attr "prefix" "maybe_vex") (set_attr "mode" "V2SF,V4SF,V2SF")]) (define_expand "sse_loadhps_exp" @@ -2204,6 +3327,22 @@ "TARGET_SSE" "ix86_fixup_binary_operands (UNKNOWN, V4SFmode, operands);") +(define_insn "*avx_loadhps" + [(set (match_operand:V4SF 0 "nonimmediate_operand" "=x,x,o") + (vec_concat:V4SF + (vec_select:V2SF + (match_operand:V4SF 1 "nonimmediate_operand" "x,x,0") + (parallel [(const_int 0) (const_int 1)])) + (match_operand:V2SF 2 "nonimmediate_operand" "m,x,x")))] + "TARGET_AVX" + "@ + vmovhps\t{%2, %1, %0|%0, %1, %2} + vmovlhps\t{%2, %1, %0|%0, %1, %2} + vmovlps\t{%2, %H0|%H0, %2}" + [(set_attr "type" "ssemov") + (set_attr "prefix" "vex") + 
(set_attr "mode" "V2SF,V4SF,V2SF")]) + (define_insn "sse_loadhps" [(set (match_operand:V4SF 0 "nonimmediate_operand" "=x,x,o") (vec_concat:V4SF @@ -2219,6 +3358,20 @@ [(set_attr "type" "ssemov") (set_attr "mode" "V2SF,V4SF,V2SF")]) +(define_insn "*avx_storelps" + [(set (match_operand:V2SF 0 "nonimmediate_operand" "=m,x,x") + (vec_select:V2SF + (match_operand:V4SF 1 "nonimmediate_operand" "x,x,m") + (parallel [(const_int 0) (const_int 1)])))] + "TARGET_AVX" + "@ + vmovlps\t{%1, %0|%0, %1} + vmovaps\t{%1, %0|%0, %1} + vmovlps\t{%1, %0, %0|%0, %0, %1}" + [(set_attr "type" "ssemov") + (set_attr "prefix" "vex") + (set_attr "mode" "V2SF,V2DF,V2SF")]) + (define_insn "sse_storelps" [(set (match_operand:V2SF 0 "nonimmediate_operand" "=m,x,x") (vec_select:V2SF @@ -2242,6 +3395,22 @@ "TARGET_SSE" "ix86_fixup_binary_operands (UNKNOWN, V4SFmode, operands);") +(define_insn "*avx_loadlps" + [(set (match_operand:V4SF 0 "nonimmediate_operand" "=x,x,m") + (vec_concat:V4SF + (match_operand:V2SF 2 "nonimmediate_operand" "x,m,x") + (vec_select:V2SF + (match_operand:V4SF 1 "nonimmediate_operand" "x,x,0") + (parallel [(const_int 2) (const_int 3)]))))] + "TARGET_AVX" + "@ + shufps\t{$0xe4, %1, %2, %0|%0, %2, %1, 0xe4} + vmovlps\t{%2, %1, %0|%0, %1, %2} + vmovlps\t{%2, %0|%0, %2}" + [(set_attr "type" "sselog,ssemov,ssemov") + (set_attr "prefix" "vex") + (set_attr "mode" "V4SF,V2SF,V2SF")]) + (define_insn "sse_loadlps" [(set (match_operand:V4SF 0 "nonimmediate_operand" "=x,x,m") (vec_concat:V4SF @@ -2257,6 +3426,18 @@ [(set_attr "type" "sselog,ssemov,ssemov") (set_attr "mode" "V4SF,V2SF,V2SF")]) +(define_insn "*avx_movss" + [(set (match_operand:V4SF 0 "register_operand" "=x") + (vec_merge:V4SF + (match_operand:V4SF 2 "register_operand" "x") + (match_operand:V4SF 1 "register_operand" "x") + (const_int 1)))] + "TARGET_AVX" + "vmovss\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "ssemov") + (set_attr "prefix" "vex") + (set_attr "mode" "SF")]) + (define_insn "sse_movss" [(set (match_operand:V4SF 0 "register_operand" "=x") (vec_merge:V4SF @@ -2268,6 +3449,16 @@ [(set_attr "type" "ssemov") (set_attr "mode" "SF")]) +(define_insn "*vec_dupv4sf_avx" + [(set (match_operand:V4SF 0 "register_operand" "=x") + (vec_duplicate:V4SF + (match_operand:SF 1 "register_operand" "x")))] + "TARGET_AVX" + "vshufps\t{$0, %1, %1, %0|%0, %1, %1, 0}" + [(set_attr "type" "sselog1") + (set_attr "prefix" "vex") + (set_attr "mode" "V4SF")]) + (define_insn "*vec_dupv4sf" [(set (match_operand:V4SF 0 "register_operand" "=x") (vec_duplicate:V4SF @@ -2277,6 +3468,25 @@ [(set_attr "type" "sselog1") (set_attr "mode" "V4SF")]) +(define_insn "*vec_concatv2sf_avx" + [(set (match_operand:V2SF 0 "register_operand" "=x,x,x,*y ,*y") + (vec_concat:V2SF + (match_operand:SF 1 "nonimmediate_operand" " x,x,m, x , m") + (match_operand:SF 2 "vector_move_operand" " x,m,C,*ym, C")))] + "TARGET_AVX" + "@ + vunpcklps\t{%2, %1, %0|%0, %1, %2} + vinsertps\t{$0x10, %2, %1, %0|%0, %1, %2, 0x10} + vmovss\t{%1, %0|%0, %1} + punpckldq\t{%2, %0|%0, %2} + movd\t{%1, %0|%0, %1}" + [(set_attr "type" "sselog,sselog,ssemov,mmxcvt,mmxmov") + (set (attr "prefix") + (if_then_else (eq_attr "alternative" "3,4") + (const_string "orig") + (const_string "vex"))) + (set_attr "mode" "V4SF,V4SF,SF,DI,DI")]) + ;; Although insertps takes register source, we prefer ;; unpcklps with register source since it is shorter. 
(define_insn "*vec_concatv2sf_sse4_1" @@ -2312,6 +3522,19 @@ [(set_attr "type" "sselog,ssemov,mmxcvt,mmxmov") (set_attr "mode" "V4SF,SF,DI,DI")]) +(define_insn "*vec_concatv4sf_avx" + [(set (match_operand:V4SF 0 "register_operand" "=x,x") + (vec_concat:V4SF + (match_operand:V2SF 1 "register_operand" " x,x") + (match_operand:V2SF 2 "nonimmediate_operand" " x,m")))] + "TARGET_AVX" + "@ + vmovlhps\t{%2, %1, %0|%0, %1, %2} + vmovhps\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "ssemov") + (set_attr "prefix" "vex") + (set_attr "mode" "V4SF,V2SF")]) + (define_insn "*vec_concatv4sf_sse" [(set (match_operand:V4SF 0 "register_operand" "=x,x") (vec_concat:V4SF @@ -2333,23 +3556,56 @@ DONE; }) -(define_insn "vec_setv4sf_0" - [(set (match_operand:V4SF 0 "nonimmediate_operand" "=x,x,Y2,m") +(define_insn "*vec_setv4sf_0_avx" + [(set (match_operand:V4SF 0 "nonimmediate_operand" "=x,x,x,m") (vec_merge:V4SF (vec_duplicate:V4SF (match_operand:SF 2 "general_operand" " x,m,*r,x*rfF")) - (match_operand:V4SF 1 "vector_move_operand" " 0,C,C ,0") + (match_operand:V4SF 1 "vector_move_operand" " x,C,C ,0") (const_int 1)))] - "TARGET_SSE" + "TARGET_AVX" "@ - movss\t{%2, %0|%0, %2} - movss\t{%2, %0|%0, %2} + vmovss\t{%2, %1, %0|%0, %1, %2} + vmovss\t{%2, %0|%0, %2} + vmovd\t{%2, %0|%0, %2} + #" + [(set_attr "type" "ssemov") + (set_attr "prefix" "vex") + (set_attr "mode" "SF")]) + +(define_insn "vec_setv4sf_0" + [(set (match_operand:V4SF 0 "nonimmediate_operand" "=x,x,Y2,m") + (vec_merge:V4SF + (vec_duplicate:V4SF + (match_operand:SF 2 "general_operand" " x,m,*r,x*rfF")) + (match_operand:V4SF 1 "vector_move_operand" " 0,C,C ,0") + (const_int 1)))] + "TARGET_SSE" + "@ + movss\t{%2, %0|%0, %2} + movss\t{%2, %0|%0, %2} movd\t{%2, %0|%0, %2} #" [(set_attr "type" "ssemov") (set_attr "mode" "SF")]) ;; A subset is vec_setv4sf. 
+(define_insn "*vec_setv4sf_avx" + [(set (match_operand:V4SF 0 "register_operand" "=x") + (vec_merge:V4SF + (vec_duplicate:V4SF + (match_operand:SF 2 "nonimmediate_operand" "xm")) + (match_operand:V4SF 1 "register_operand" "x") + (match_operand:SI 3 "const_pow2_1_to_8_operand" "n")))] + "TARGET_AVX" +{ + operands[3] = GEN_INT (exact_log2 (INTVAL (operands[3])) << 4); + return "vinsertps\t{%3, %2, %1, %0|%0, %1, %2, %3}"; +} + [(set_attr "type" "sselog") + (set_attr "prefix" "vex") + (set_attr "mode" "V4SF")]) + (define_insn "*vec_setv4sf_sse4_1" [(set (match_operand:V4SF 0 "register_operand" "=x") (vec_merge:V4SF @@ -2366,6 +3622,18 @@ (set_attr "prefix_extra" "1") (set_attr "mode" "V4SF")]) +(define_insn "*avx_insertps" + [(set (match_operand:V4SF 0 "register_operand" "=x") + (unspec:V4SF [(match_operand:V4SF 2 "nonimmediate_operand" "xm") + (match_operand:V4SF 1 "register_operand" "x") + (match_operand:SI 3 "const_0_to_255_operand" "n")] + UNSPEC_INSERTPS))] + "TARGET_AVX" + "vinsertps\t{%3, %2, %1, %0|%0, %1, %2, %3}"; + [(set_attr "type" "sselog") + (set_attr "prefix" "vex") + (set_attr "mode" "V4SF")]) + (define_insn "sse4_1_insertps" [(set (match_operand:V4SF 0 "register_operand" "=x") (unspec:V4SF [(match_operand:V4SF 2 "register_operand" "x") @@ -2422,15 +3690,154 @@ DONE; }) +(define_expand "avx_vextractf128" + [(match_operand: 0 "nonimmediate_operand" "") + (match_operand:AVX256MODE 1 "register_operand" "") + (match_operand:SI 2 "const_0_to_1_operand" "")] + "TARGET_AVX" +{ + switch (INTVAL (operands[2])) + { + case 0: + emit_insn (gen_vec_extract_lo_ (operands[0], operands[1])); + break; + case 1: + emit_insn (gen_vec_extract_hi_ (operands[0], operands[1])); + break; + default: + gcc_unreachable (); + } + DONE; +}) + +(define_insn "vec_extract_lo_" + [(set (match_operand: 0 "nonimmediate_operand" "=x,m") + (vec_select: + (match_operand:AVX256MODE4P 1 "register_operand" "x,x") + (parallel [(const_int 0) (const_int 1)])))] + "TARGET_AVX" + "vextractf128\t{$0x0, %1, %0|%0, %1, 0x0}" + [(set_attr "type" "sselog") + (set_attr "memory" "none,store") + (set_attr "prefix" "vex") + (set_attr "mode" "V8SF")]) + +(define_insn "vec_extract_hi_" + [(set (match_operand: 0 "nonimmediate_operand" "=x,m") + (vec_select: + (match_operand:AVX256MODE4P 1 "register_operand" "x,x") + (parallel [(const_int 2) (const_int 3)])))] + "TARGET_AVX" + "vextractf128\t{$0x1, %1, %0|%0, %1, 0x1}" + [(set_attr "type" "sselog") + (set_attr "memory" "none,store") + (set_attr "prefix" "vex") + (set_attr "mode" "V8SF")]) + +(define_insn "vec_extract_lo_" + [(set (match_operand: 0 "nonimmediate_operand" "=x,m") + (vec_select: + (match_operand:AVX256MODE8P 1 "register_operand" "x,x") + (parallel [(const_int 0) (const_int 1) + (const_int 2) (const_int 3)])))] + "TARGET_AVX" + "vextractf128\t{$0x1, %1, %0|%0, %1, 0x1}" + [(set_attr "type" "sselog") + (set_attr "memory" "none,store") + (set_attr "prefix" "vex") + (set_attr "mode" "V8SF")]) + +(define_insn "vec_extract_hi_" + [(set (match_operand: 0 "nonimmediate_operand" "=x,m") + (vec_select: + (match_operand:AVX256MODE8P 1 "register_operand" "x,x") + (parallel [(const_int 4) (const_int 5) + (const_int 6) (const_int 7)])))] + "TARGET_AVX" + "vextractf128\t{$0x1, %1, %0|%0, %1, 0x1}" + [(set_attr "type" "sselog") + (set_attr "memory" "none,store") + (set_attr "prefix" "vex") + (set_attr "mode" "V8SF")]) + +(define_insn "vec_extract_lo_v16hi" + [(set (match_operand:V8HI 0 "nonimmediate_operand" "=x,m") + (vec_select:V8HI + (match_operand:V16HI 1 "register_operand" "x,x") 
+ (parallel [(const_int 0) (const_int 1) + (const_int 2) (const_int 3) + (const_int 4) (const_int 5) + (const_int 6) (const_int 7)])))] + "TARGET_AVX" + "vextractf128\t{$0x1, %1, %0|%0, %1, 0x1}" + [(set_attr "type" "sselog") + (set_attr "memory" "none,store") + (set_attr "prefix" "vex") + (set_attr "mode" "V8SF")]) + +(define_insn "vec_extract_hi_v16hi" + [(set (match_operand:V8HI 0 "nonimmediate_operand" "=x,m") + (vec_select:V8HI + (match_operand:V16HI 1 "register_operand" "x,x") + (parallel [(const_int 8) (const_int 9) + (const_int 10) (const_int 11) + (const_int 12) (const_int 13) + (const_int 14) (const_int 15)])))] + "TARGET_AVX" + "vextractf128\t{$0x1, %1, %0|%0, %1, 0x1}" + [(set_attr "type" "sselog") + (set_attr "memory" "none,store") + (set_attr "prefix" "vex") + (set_attr "mode" "V8SF")]) + +(define_insn "vec_extract_lo_v32qi" + [(set (match_operand:V16QI 0 "nonimmediate_operand" "=x,m") + (vec_select:V16QI + (match_operand:V32QI 1 "register_operand" "x,x") + (parallel [(const_int 0) (const_int 1) + (const_int 2) (const_int 3) + (const_int 4) (const_int 5) + (const_int 6) (const_int 7) + (const_int 8) (const_int 9) + (const_int 10) (const_int 11) + (const_int 12) (const_int 13) + (const_int 14) (const_int 15)])))] + "TARGET_AVX" + "vextractf128\t{$0x1, %1, %0|%0, %1, 0x1}" + [(set_attr "type" "sselog") + (set_attr "memory" "none,store") + (set_attr "prefix" "vex") + (set_attr "mode" "V8SF")]) + +(define_insn "vec_extract_hi_v32qi" + [(set (match_operand:V16QI 0 "nonimmediate_operand" "=x,m") + (vec_select:V16QI + (match_operand:V32QI 1 "register_operand" "x,x") + (parallel [(const_int 16) (const_int 17) + (const_int 18) (const_int 19) + (const_int 20) (const_int 21) + (const_int 22) (const_int 23) + (const_int 24) (const_int 25) + (const_int 26) (const_int 27) + (const_int 28) (const_int 29) + (const_int 30) (const_int 31)])))] + "TARGET_AVX" + "vextractf128\t{$0x1, %1, %0|%0, %1, 0x1}" + [(set_attr "type" "sselog") + (set_attr "memory" "none,store") + (set_attr "prefix" "vex") + (set_attr "mode" "V8SF")]) + (define_insn "*sse4_1_extractps" [(set (match_operand:SF 0 "nonimmediate_operand" "=rm") (vec_select:SF (match_operand:V4SF 1 "register_operand" "x") (parallel [(match_operand:SI 2 "const_0_to_3_operand" "n")])))] "TARGET_SSE4_1" - "extractps\t{%2, %1, %0|%0, %1, %2}" + "%vextractps\t{%2, %1, %0|%0, %1, %2}" [(set_attr "type" "sselog") (set_attr "prefix_extra" "1") + (set_attr "prefix" "maybe_vex") (set_attr "mode" "V4SF")]) (define_insn_and_split "*vec_extract_v4sf_mem" @@ -2466,6 +3873,20 @@ ;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +(define_insn "avx_unpckhpd256" + [(set (match_operand:V4DF 0 "register_operand" "=x") + (vec_select:V4DF + (vec_concat:V8DF + (match_operand:V4DF 1 "register_operand" "x") + (match_operand:V4DF 2 "nonimmediate_operand" "xm")) + (parallel [(const_int 2) (const_int 6) + (const_int 3) (const_int 7)])))] + "TARGET_AVX" + "vunpckhpd\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sselog") + (set_attr "prefix" "vex") + (set_attr "mode" "V4DF")]) + (define_expand "sse2_unpckhpd_exp" [(set (match_operand:V2DF 0 "nonimmediate_operand" "") (vec_select:V2DF @@ -2477,6 +3898,23 @@ "TARGET_SSE2" "ix86_fixup_binary_operands (UNKNOWN, V2DFmode, operands);") +(define_insn "*avx_unpckhpd" + [(set (match_operand:V2DF 0 "nonimmediate_operand" "=x,x,m") + (vec_select:V2DF + (vec_concat:V4DF + (match_operand:V2DF 1 "nonimmediate_operand" " x,o,x") + (match_operand:V2DF 2 "nonimmediate_operand" " x,x,0")) + (parallel [(const_int 1) + 
(const_int 3)])))] + "TARGET_AVX && !(MEM_P (operands[1]) && MEM_P (operands[2]))" + "@ + vunpckhpd\t{%2, %1, %0|%0, %1, %2} + vmovlpd\t{%H1, %2, %0|%0, %2, %H1} + vmovhpd\t{%1, %0|%0, %1}" + [(set_attr "type" "sselog,ssemov,ssemov") + (set_attr "prefix" "vex") + (set_attr "mode" "V2DF,V1DF,V1DF")]) + (define_insn "sse2_unpckhpd" [(set (match_operand:V2DF 0 "nonimmediate_operand" "=x,x,m") (vec_select:V2DF @@ -2493,6 +3931,36 @@ [(set_attr "type" "sselog,ssemov,ssemov") (set_attr "mode" "V2DF,V1DF,V1DF")]) +(define_insn "avx_movddup256" + [(set (match_operand:V4DF 0 "register_operand" "=x") + (vec_select:V4DF + (vec_concat:V8DF + (match_operand:V4DF 1 "nonimmediate_operand" "xm") + (match_dup 1)) + (parallel [(const_int 0) (const_int 2) + (const_int 4) (const_int 6)])))] + "TARGET_AVX" + "vmovddup\t{%1, %0|%0, %1}" + [(set_attr "type" "sselog1") + (set_attr "prefix" "vex") + (set_attr "mode" "V4DF")]) + +(define_insn "*avx_movddup" + [(set (match_operand:V2DF 0 "nonimmediate_operand" "=x,o") + (vec_select:V2DF + (vec_concat:V4DF + (match_operand:V2DF 1 "nonimmediate_operand" "xm,x") + (match_dup 1)) + (parallel [(const_int 0) + (const_int 2)])))] + "TARGET_AVX && !(MEM_P (operands[0]) && MEM_P (operands[1]))" + "@ + vmovddup\t{%1, %0|%0, %1} + #" + [(set_attr "type" "sselog1,ssemov") + (set_attr "prefix" "vex") + (set_attr "mode" "V2DF")]) + (define_insn "*sse3_movddup" [(set (match_operand:V2DF 0 "nonimmediate_operand" "=x,o") (vec_select:V2DF @@ -2525,6 +3993,20 @@ DONE; }) +(define_insn "avx_unpcklpd256" + [(set (match_operand:V4DF 0 "register_operand" "=x") + (vec_select:V4DF + (vec_concat:V8DF + (match_operand:V4DF 1 "register_operand" "x") + (match_operand:V4DF 2 "nonimmediate_operand" "xm")) + (parallel [(const_int 0) (const_int 4) + (const_int 1) (const_int 5)])))] + "TARGET_AVX" + "vunpcklpd\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sselog") + (set_attr "prefix" "vex") + (set_attr "mode" "V4DF")]) + (define_expand "sse2_unpcklpd_exp" [(set (match_operand:V2DF 0 "nonimmediate_operand" "") (vec_select:V2DF @@ -2536,6 +4018,23 @@ "TARGET_SSE2" "ix86_fixup_binary_operands (UNKNOWN, V2DFmode, operands);") +(define_insn "*avx_unpcklpd" + [(set (match_operand:V2DF 0 "nonimmediate_operand" "=x,x,o") + (vec_select:V2DF + (vec_concat:V4DF + (match_operand:V2DF 1 "nonimmediate_operand" " x,x,0") + (match_operand:V2DF 2 "nonimmediate_operand" " x,m,x")) + (parallel [(const_int 0) + (const_int 2)])))] + "TARGET_AVX && !(MEM_P (operands[1]) && MEM_P (operands[2]))" + "@ + vunpcklpd\t{%2, %1, %0|%0, %1, %2} + vmovhpd\t{%2, %1, %0|%0, %1, %2} + vmovlpd\t{%2, %H0|%H0, %2}" + [(set_attr "type" "sselog,ssemov,ssemov") + (set_attr "prefix" "vex") + (set_attr "mode" "V2DF,V1DF,V1DF")]) + (define_insn "sse2_unpcklpd" [(set (match_operand:V2DF 0 "nonimmediate_operand" "=x,x,o") (vec_select:V2DF @@ -2552,6 +4051,47 @@ [(set_attr "type" "sselog,ssemov,ssemov") (set_attr "mode" "V2DF,V1DF,V1DF")]) +(define_expand "avx_shufpd256" + [(match_operand:V4DF 0 "register_operand" "") + (match_operand:V4DF 1 "register_operand" "") + (match_operand:V4DF 2 "nonimmediate_operand" "") + (match_operand:SI 3 "const_int_operand" "")] + "TARGET_AVX" +{ + int mask = INTVAL (operands[3]); + emit_insn (gen_avx_shufpd256_1 (operands[0], operands[1], operands[2], + GEN_INT (mask & 1), + GEN_INT (mask & 2 ? 5 : 4), + GEN_INT (mask & 4 ? 3 : 2), + GEN_INT (mask & 8 ? 
7 : 6))); + DONE; +}) + +(define_insn "avx_shufpd256_1" + [(set (match_operand:V4DF 0 "register_operand" "=x") + (vec_select:V4DF + (vec_concat:V8DF + (match_operand:V4DF 1 "register_operand" "x") + (match_operand:V4DF 2 "nonimmediate_operand" "xm")) + (parallel [(match_operand 3 "const_0_to_1_operand" "") + (match_operand 4 "const_4_to_5_operand" "") + (match_operand 5 "const_2_to_3_operand" "") + (match_operand 6 "const_6_to_7_operand" "")])))] + "TARGET_AVX" +{ + int mask; + mask = INTVAL (operands[3]); + mask |= (INTVAL (operands[4]) - 4) << 1; + mask |= (INTVAL (operands[5]) - 2) << 2; + mask |= (INTVAL (operands[6]) - 6) << 3; + operands[3] = GEN_INT (mask); + + return "vshufpd\t{%3, %2, %1, %0|%0, %1, %2, %3}"; +} + [(set_attr "type" "sselog") + (set_attr "prefix" "vex") + (set_attr "mode" "V4DF")]) + (define_expand "sse2_shufpd" [(match_operand:V2DF 0 "register_operand" "") (match_operand:V2DF 1 "register_operand" "") @@ -2611,6 +4151,20 @@ "TARGET_SSE2") ;; punpcklqdq and punpckhqdq are shorter than shufpd. +(define_insn "*avx_punpckhqdq" + [(set (match_operand:V2DI 0 "register_operand" "=x") + (vec_select:V2DI + (vec_concat:V4DI + (match_operand:V2DI 1 "register_operand" "x") + (match_operand:V2DI 2 "nonimmediate_operand" "xm")) + (parallel [(const_int 1) + (const_int 3)])))] + "TARGET_AVX" + "vpunpckhqdq\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sselog") + (set_attr "prefix" "vex") + (set_attr "mode" "TI")]) + (define_insn "sse2_punpckhqdq" [(set (match_operand:V2DI 0 "register_operand" "=x") (vec_select:V2DI @@ -2625,6 +4179,20 @@ (set_attr "prefix_data16" "1") (set_attr "mode" "TI")]) +(define_insn "*avx_punpcklqdq" + [(set (match_operand:V2DI 0 "register_operand" "=x") + (vec_select:V2DI + (vec_concat:V4DI + (match_operand:V2DI 1 "register_operand" "x") + (match_operand:V2DI 2 "nonimmediate_operand" "xm")) + (parallel [(const_int 0) + (const_int 2)])))] + "TARGET_AVX" + "vpunpcklqdq\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sselog") + (set_attr "prefix" "vex") + (set_attr "mode" "TI")]) + (define_insn "sse2_punpcklqdq" [(set (match_operand:V2DI 0 "register_operand" "=x") (vec_select:V2DI @@ -2639,6 +4207,27 @@ (set_attr "prefix_data16" "1") (set_attr "mode" "TI")]) +(define_insn "*avx_shufpd_" + [(set (match_operand:SSEMODE2D 0 "register_operand" "=x") + (vec_select:SSEMODE2D + (vec_concat: + (match_operand:SSEMODE2D 1 "register_operand" "x") + (match_operand:SSEMODE2D 2 "nonimmediate_operand" "xm")) + (parallel [(match_operand 3 "const_0_to_1_operand" "") + (match_operand 4 "const_2_to_3_operand" "")])))] + "TARGET_AVX" +{ + int mask; + mask = INTVAL (operands[3]); + mask |= (INTVAL (operands[4]) - 2) << 1; + operands[3] = GEN_INT (mask); + + return "vshufpd\t{%3, %2, %1, %0|%0, %1, %2, %3}"; +} + [(set_attr "type" "sselog") + (set_attr "prefix" "vex") + (set_attr "mode" "V2DF")]) + (define_insn "sse2_shufpd_" [(set (match_operand:SSEMODE2D 0 "register_operand" "=x") (vec_select:SSEMODE2D @@ -2661,6 +4250,22 @@ ;; Avoid combining registers from different units in a single alternative, ;; see comment above inline_secondary_memory_needed function in i386.c +(define_insn "*avx_storehpd" + [(set (match_operand:DF 0 "nonimmediate_operand" "=m,x,x,*f,r") + (vec_select:DF + (match_operand:V2DF 1 "nonimmediate_operand" " x,x,o,o,o") + (parallel [(const_int 1)])))] + "TARGET_AVX && !(MEM_P (operands[0]) && MEM_P (operands[1]))" + "@ + vmovhpd\t{%1, %0|%0, %1} + vunpckhpd\t{%1, %1, %0|%0, %1, %1} + # + # + #" + [(set_attr "type" "ssemov,sselog1,ssemov,fmov,imov") + (set_attr 
"prefix" "vex") + (set_attr "mode" "V1DF,V2DF,DF,DF,DF")]) + (define_insn "sse2_storehpd" [(set (match_operand:DF 0 "nonimmediate_operand" "=m,x,x,*f,r") (vec_select:DF @@ -2696,12 +4301,13 @@ (parallel [(const_int 0)])))] "TARGET_SSE2 && !(MEM_P (operands[0]) && MEM_P (operands[1]))" "@ - movlpd\t{%1, %0|%0, %1} + %vmovlpd\t{%1, %0|%0, %1} # # # #" [(set_attr "type" "ssemov,ssemov,ssemov,fmov,imov") + (set_attr "prefix" "maybe_vex") (set_attr "mode" "V1DF,DF,DF,DF,DF")]) (define_split @@ -2733,6 +4339,24 @@ ;; Avoid combining registers from different units in a single alternative, ;; see comment above inline_secondary_memory_needed function in i386.c +(define_insn "*avx_loadhpd" + [(set (match_operand:V2DF 0 "nonimmediate_operand" "=x,x,o,o,o") + (vec_concat:V2DF + (vec_select:DF + (match_operand:V2DF 1 "nonimmediate_operand" " x,x,0,0,0") + (parallel [(const_int 0)])) + (match_operand:DF 2 "nonimmediate_operand" " m,x,x,*f,r")))] + "TARGET_AVX && !(MEM_P (operands[1]) && MEM_P (operands[2]))" + "@ + vmovhpd\t{%2, %1, %0|%0, %1, %2} + vunpcklpd\t{%2, %1, %0|%0, %1, %2} + # + # + #" + [(set_attr "type" "ssemov,sselog,ssemov,fmov,imov") + (set_attr "prefix" "vex") + (set_attr "mode" "V1DF,V2DF,DF,DF,DF")]) + (define_insn "sse2_loadhpd" [(set (match_operand:V2DF 0 "nonimmediate_operand" "=x,x,x,o,o,o") (vec_concat:V2DF @@ -2774,6 +4398,26 @@ ;; Avoid combining registers from different units in a single alternative, ;; see comment above inline_secondary_memory_needed function in i386.c +(define_insn "*avx_loadlpd" + [(set (match_operand:V2DF 0 "nonimmediate_operand" "=x,x,x,x,m,m,m") + (vec_concat:V2DF + (match_operand:DF 2 "nonimmediate_operand" " m,m,x,x,x,*f,r") + (vec_select:DF + (match_operand:V2DF 1 "vector_move_operand" " C,x,x,o,0,0,0") + (parallel [(const_int 1)]))))] + "TARGET_AVX && !(MEM_P (operands[1]) && MEM_P (operands[2]))" + "@ + vmovsd\t{%2, %0|%0, %2} + vmovlpd\t{%2, %1, %0|%0, %1, %2} + vmovsd\t{%2, %1, %0|%0, %1, %2} + vmovhpd\t{%H1, %2, %0|%0, %2, %H1} + # + # + #" + [(set_attr "type" "ssemov,ssemov,ssemov,ssemov,ssemov,fmov,imov") + (set_attr "prefix" "vex") + (set_attr "mode" "DF,V1DF,V1DF,V1DF,DF,DF,DF")]) + (define_insn "sse2_loadlpd" [(set (match_operand:V2DF 0 "nonimmediate_operand" "=x,x,x,x,x,m,m,m") (vec_concat:V2DF @@ -2835,6 +4479,23 @@ [(set_attr "type" "ssemov") (set_attr "mode" "V2SF,V4SF,V2SF")]) +(define_insn "*avx_movsd" + [(set (match_operand:V2DF 0 "nonimmediate_operand" "=x,x,m,x,o") + (vec_merge:V2DF + (match_operand:V2DF 2 "nonimmediate_operand" " x,m,x,x,0") + (match_operand:V2DF 1 "nonimmediate_operand" " x,x,0,o,x") + (const_int 1)))] + "TARGET_AVX" + "@ + vmovsd\t{%2, %1, %0|%0, %1, %2} + vmovlpd\t{%2, %1, %0|%0, %1, %2} + vmovlpd\t{%2, %0|%0, %2} + vmovhps\t{%H1, %2, %0|%0, %2, %H1} + vmovhps\t{%1, %H0|%H0, %1}" + [(set_attr "type" "ssemov,ssemov,ssemov,ssemov,ssemov") + (set_attr "prefix" "vex") + (set_attr "mode" "DF,V1DF,V1DF,V1DF,V1DF")]) + (define_insn "sse2_movsd" [(set (match_operand:V2DF 0 "nonimmediate_operand" "=x,x,m,x,x,o") (vec_merge:V2DF @@ -2857,8 +4518,9 @@ (vec_duplicate:V2DF (match_operand:DF 1 "nonimmediate_operand" "xm")))] "TARGET_SSE3" - "movddup\t{%1, %0|%0, %1}" + "%vmovddup\t{%1, %0|%0, %1}" [(set_attr "type" "sselog1") + (set_attr "prefix" "maybe_vex") (set_attr "mode" "DF")]) (define_insn "vec_dupv2df" @@ -2876,10 +4538,25 @@ (match_operand:DF 1 "nonimmediate_operand" "xm") (match_dup 1)))] "TARGET_SSE3" - "movddup\t{%1, %0|%0, %1}" + "%vmovddup\t{%1, %0|%0, %1}" [(set_attr "type" "sselog1") + (set_attr "prefix" 
"maybe_vex") (set_attr "mode" "DF")]) +(define_insn "*vec_concatv2df_avx" + [(set (match_operand:V2DF 0 "register_operand" "=x,x,x") + (vec_concat:V2DF + (match_operand:DF 1 "nonimmediate_operand" " x,x,m") + (match_operand:DF 2 "vector_move_operand" " x,m,C")))] + "TARGET_AVX" + "@ + vunpcklpd\t{%2, %1, %0|%0, %1, %2} + vmovhpd\t{%2, %1, %0|%0, %1, %2} + vmovsd\t{%1, %0|%0, %1}" + [(set_attr "type" "ssemov") + (set_attr "prefix" "vex") + (set_attr "mode" "DF,V1DF,DF")]) + (define_insn "*vec_concatv2df" [(set (match_operand:V2DF 0 "register_operand" "=Y2,Y2,Y2,x,x") (vec_concat:V2DF @@ -2917,6 +4594,17 @@ "TARGET_SSE2" "ix86_fixup_binary_operands_no_copy (, mode, operands);") +(define_insn "*avx_3" + [(set (match_operand:SSEMODEI 0 "register_operand" "=x") + (plusminus:SSEMODEI + (match_operand:SSEMODEI 1 "nonimmediate_operand" "x") + (match_operand:SSEMODEI 2 "nonimmediate_operand" "xm")))] + "TARGET_AVX && ix86_binary_operator_ok (, mode, operands)" + "vp\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sseiadd") + (set_attr "prefix" "vex") + (set_attr "mode" "TI")]) + (define_insn "*3" [(set (match_operand:SSEMODEI 0 "register_operand" "=x") (plusminus:SSEMODEI @@ -2936,6 +4624,17 @@ "TARGET_SSE2" "ix86_fixup_binary_operands_no_copy (, mode, operands);") +(define_insn "*avx_3" + [(set (match_operand:SSEMODE12 0 "register_operand" "=x") + (sat_plusminus:SSEMODE12 + (match_operand:SSEMODE12 1 "nonimmediate_operand" "x") + (match_operand:SSEMODE12 2 "nonimmediate_operand" "xm")))] + "TARGET_AVX && ix86_binary_operator_ok (, mode, operands)" + "vp\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sseiadd") + (set_attr "prefix" "vex") + (set_attr "mode" "TI")]) + (define_insn "*sse2_3" [(set (match_operand:SSEMODE12 0 "register_operand" "=x") (sat_plusminus:SSEMODE12 @@ -3033,6 +4732,16 @@ "TARGET_SSE2" "ix86_fixup_binary_operands_no_copy (MULT, V8HImode, operands);") +(define_insn "*avx_mulv8hi3" + [(set (match_operand:V8HI 0 "register_operand" "=x") + (mult:V8HI (match_operand:V8HI 1 "nonimmediate_operand" "%x") + (match_operand:V8HI 2 "nonimmediate_operand" "xm")))] + "TARGET_AVX && ix86_binary_operator_ok (MULT, V8HImode, operands)" + "vpmullw\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sseimul") + (set_attr "prefix" "vex") + (set_attr "mode" "TI")]) + (define_insn "*mulv8hi3" [(set (match_operand:V8HI 0 "register_operand" "=x") (mult:V8HI (match_operand:V8HI 1 "nonimmediate_operand" "%0") @@ -3056,6 +4765,22 @@ "TARGET_SSE2" "ix86_fixup_binary_operands_no_copy (MULT, V8HImode, operands);") +(define_insn "*avxv8hi3_highpart" + [(set (match_operand:V8HI 0 "register_operand" "=x") + (truncate:V8HI + (lshiftrt:V8SI + (mult:V8SI + (sign_extend:V8SI + (match_operand:V8HI 1 "nonimmediate_operand" "%x")) + (sign_extend:V8SI + (match_operand:V8HI 2 "nonimmediate_operand" "xm"))) + (const_int 16))))] + "TARGET_AVX && ix86_binary_operator_ok (MULT, V8HImode, operands)" + "vpmulhw\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sseimul") + (set_attr "prefix" "vex") + (set_attr "mode" "TI")]) + (define_insn "*smulv8hi3_highpart" [(set (match_operand:V8HI 0 "register_operand" "=x") (truncate:V8HI @@ -3085,6 +4810,22 @@ "TARGET_SSE2" "ix86_fixup_binary_operands_no_copy (MULT, V8HImode, operands);") +(define_insn "*avx_umulv8hi3_highpart" + [(set (match_operand:V8HI 0 "register_operand" "=x") + (truncate:V8HI + (lshiftrt:V8SI + (mult:V8SI + (zero_extend:V8SI + (match_operand:V8HI 1 "nonimmediate_operand" "%x")) + (zero_extend:V8SI + (match_operand:V8HI 2 "nonimmediate_operand" "xm"))) + (const_int 16))))] 
+ "TARGET_AVX && ix86_binary_operator_ok (MULT, V8HImode, operands)" + "vpmulhuw\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sseimul") + (set_attr "prefix" "vex") + (set_attr "mode" "TI")]) + (define_insn "*umulv8hi3_highpart" [(set (match_operand:V8HI 0 "register_operand" "=x") (truncate:V8HI @@ -3115,6 +4856,23 @@ "TARGET_SSE2" "ix86_fixup_binary_operands_no_copy (MULT, V4SImode, operands);") +(define_insn "*avx_umulv2siv2di3" + [(set (match_operand:V2DI 0 "register_operand" "=x") + (mult:V2DI + (zero_extend:V2DI + (vec_select:V2SI + (match_operand:V4SI 1 "nonimmediate_operand" "%x") + (parallel [(const_int 0) (const_int 2)]))) + (zero_extend:V2DI + (vec_select:V2SI + (match_operand:V4SI 2 "nonimmediate_operand" "xm") + (parallel [(const_int 0) (const_int 2)])))))] + "TARGET_AVX && ix86_binary_operator_ok (MULT, V4SImode, operands)" + "vpmuludq\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sseimul") + (set_attr "prefix" "vex") + (set_attr "mode" "TI")]) + (define_insn "*sse2_umulv2siv2di3" [(set (match_operand:V2DI 0 "register_operand" "=x") (mult:V2DI @@ -3145,28 +4903,45 @@ (parallel [(const_int 0) (const_int 2)])))))] "TARGET_SSE4_1" "ix86_fixup_binary_operands_no_copy (MULT, V4SImode, operands);") - -(define_insn "*sse4_1_mulv2siv2di3" + +(define_insn "*avx_mulv2siv2di3" [(set (match_operand:V2DI 0 "register_operand" "=x") (mult:V2DI (sign_extend:V2DI (vec_select:V2SI - (match_operand:V4SI 1 "nonimmediate_operand" "%0") + (match_operand:V4SI 1 "nonimmediate_operand" "%x") (parallel [(const_int 0) (const_int 2)]))) (sign_extend:V2DI (vec_select:V2SI (match_operand:V4SI 2 "nonimmediate_operand" "xm") (parallel [(const_int 0) (const_int 2)])))))] - "TARGET_SSE4_1 && ix86_binary_operator_ok (MULT, V4SImode, operands)" - "pmuldq\t{%2, %0|%0, %2}" + "TARGET_AVX && ix86_binary_operator_ok (MULT, V4SImode, operands)" + "vpmuldq\t{%2, %1, %0|%0, %1, %2}" [(set_attr "type" "sseimul") - (set_attr "prefix_extra" "1") + (set_attr "prefix" "vex") (set_attr "mode" "TI")]) -(define_expand "sse2_pmaddwd" - [(set (match_operand:V4SI 0 "register_operand" "") - (plus:V4SI - (mult:V4SI +(define_insn "*sse4_1_mulv2siv2di3" + [(set (match_operand:V2DI 0 "register_operand" "=x") + (mult:V2DI + (sign_extend:V2DI + (vec_select:V2SI + (match_operand:V4SI 1 "nonimmediate_operand" "%0") + (parallel [(const_int 0) (const_int 2)]))) + (sign_extend:V2DI + (vec_select:V2SI + (match_operand:V4SI 2 "nonimmediate_operand" "xm") + (parallel [(const_int 0) (const_int 2)])))))] + "TARGET_SSE4_1 && ix86_binary_operator_ok (MULT, V4SImode, operands)" + "pmuldq\t{%2, %0|%0, %2}" + [(set_attr "type" "sseimul") + (set_attr "prefix_extra" "1") + (set_attr "mode" "TI")]) + +(define_expand "sse2_pmaddwd" + [(set (match_operand:V4SI 0 "register_operand" "") + (plus:V4SI + (mult:V4SI (sign_extend:V4SI (vec_select:V4HI (match_operand:V8HI 1 "nonimmediate_operand" "") @@ -3197,6 +4972,43 @@ "TARGET_SSE2" "ix86_fixup_binary_operands_no_copy (MULT, V8HImode, operands);") +(define_insn "*avx_pmaddwd" + [(set (match_operand:V4SI 0 "register_operand" "=x") + (plus:V4SI + (mult:V4SI + (sign_extend:V4SI + (vec_select:V4HI + (match_operand:V8HI 1 "nonimmediate_operand" "%x") + (parallel [(const_int 0) + (const_int 2) + (const_int 4) + (const_int 6)]))) + (sign_extend:V4SI + (vec_select:V4HI + (match_operand:V8HI 2 "nonimmediate_operand" "xm") + (parallel [(const_int 0) + (const_int 2) + (const_int 4) + (const_int 6)])))) + (mult:V4SI + (sign_extend:V4SI + (vec_select:V4HI (match_dup 1) + (parallel [(const_int 1) + (const_int 3) + 
(const_int 5) + (const_int 7)]))) + (sign_extend:V4SI + (vec_select:V4HI (match_dup 2) + (parallel [(const_int 1) + (const_int 3) + (const_int 5) + (const_int 7)]))))))] + "TARGET_AVX && ix86_binary_operator_ok (MULT, V8HImode, operands)" + "vpmaddwd\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sseiadd") + (set_attr "prefix" "vex") + (set_attr "mode" "TI")]) + (define_insn "*sse2_pmaddwd" [(set (match_operand:V4SI 0 "register_operand" "=x") (plus:V4SI @@ -3244,6 +5056,16 @@ ix86_fixup_binary_operands_no_copy (MULT, V4SImode, operands); }) +(define_insn "*avx_mulv4si3" + [(set (match_operand:V4SI 0 "register_operand" "=x") + (mult:V4SI (match_operand:V4SI 1 "nonimmediate_operand" "%x") + (match_operand:V4SI 2 "nonimmediate_operand" "xm")))] + "TARGET_AVX && ix86_binary_operator_ok (MULT, V4SImode, operands)" + "vpmulld\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sseimul") + (set_attr "prefix" "vex") + (set_attr "mode" "TI")]) + (define_insn "*sse4_1_mulv4si3" [(set (match_operand:V4SI 0 "register_operand" "=x") (mult:V4SI (match_operand:V4SI 1 "nonimmediate_operand" "%0") @@ -3627,6 +5449,17 @@ DONE; }) +(define_insn "*avx_ashr3" + [(set (match_operand:SSEMODE24 0 "register_operand" "=x") + (ashiftrt:SSEMODE24 + (match_operand:SSEMODE24 1 "register_operand" "x") + (match_operand:SI 2 "nonmemory_operand" "xN")))] + "TARGET_AVX" + "vpsra\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sseishft") + (set_attr "prefix" "vex") + (set_attr "mode" "TI")]) + (define_insn "ashr3" [(set (match_operand:SSEMODE24 0 "register_operand" "=x") (ashiftrt:SSEMODE24 @@ -3638,6 +5471,17 @@ (set_attr "prefix_data16" "1") (set_attr "mode" "TI")]) +(define_insn "*avx_lshr3" + [(set (match_operand:SSEMODE248 0 "register_operand" "=x") + (lshiftrt:SSEMODE248 + (match_operand:SSEMODE248 1 "register_operand" "x") + (match_operand:SI 2 "nonmemory_operand" "xN")))] + "TARGET_AVX" + "vpsrl\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sseishft") + (set_attr "prefix" "vex") + (set_attr "mode" "TI")]) + (define_insn "lshr3" [(set (match_operand:SSEMODE248 0 "register_operand" "=x") (lshiftrt:SSEMODE248 @@ -3649,6 +5493,17 @@ (set_attr "prefix_data16" "1") (set_attr "mode" "TI")]) +(define_insn "*avx_ashl3" + [(set (match_operand:SSEMODE248 0 "register_operand" "=x") + (ashift:SSEMODE248 + (match_operand:SSEMODE248 1 "register_operand" "x") + (match_operand:SI 2 "nonmemory_operand" "xN")))] + "TARGET_AVX" + "vpsll\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sseishft") + (set_attr "prefix" "vex") + (set_attr "mode" "TI")]) + (define_insn "ashl3" [(set (match_operand:SSEMODE248 0 "register_operand" "=x") (ashift:SSEMODE248 @@ -3680,6 +5535,17 @@ operands[1] = gen_lowpart (TImode, operands[1]); }) +(define_insn "*avx_3" + [(set (match_operand:SSEMODE124 0 "register_operand" "=x") + (maxmin:SSEMODE124 + (match_operand:SSEMODE124 1 "nonimmediate_operand" "%x") + (match_operand:SSEMODE124 2 "nonimmediate_operand" "xm")))] + "TARGET_AVX && ix86_binary_operator_ok (, mode, operands)" + "vp\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sseiadd") + (set_attr "prefix" "vex") + (set_attr "mode" "TI")]) + (define_expand "v16qi3" [(set (match_operand:V16QI 0 "register_operand" "") (umaxmin:V16QI @@ -3873,6 +5739,17 @@ "TARGET_SSE2 && !TARGET_SSE5" "ix86_fixup_binary_operands_no_copy (EQ, mode, operands);") +(define_insn "*avx_eq3" + [(set (match_operand:SSEMODE1248 0 "register_operand" "=x") + (eq:SSEMODE1248 + (match_operand:SSEMODE1248 1 "nonimmediate_operand" "%x") + (match_operand:SSEMODE1248 2 "nonimmediate_operand" 
"xm")))] + "TARGET_AVX && ix86_binary_operator_ok (EQ, mode, operands)" + "vpcmpeq\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "ssecmp") + (set_attr "prefix" "vex") + (set_attr "mode" "TI")]) + (define_insn "*sse2_eq3" [(set (match_operand:SSEMODE124 0 "register_operand" "=x") (eq:SSEMODE124 @@ -3904,6 +5781,17 @@ (set_attr "prefix_extra" "1") (set_attr "mode" "TI")]) +(define_insn "*avx_gt3" + [(set (match_operand:SSEMODE1248 0 "register_operand" "=x") + (gt:SSEMODE1248 + (match_operand:SSEMODE1248 1 "register_operand" "x") + (match_operand:SSEMODE1248 2 "nonimmediate_operand" "xm")))] + "TARGET_AVX" + "vpcmpgt\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "ssecmp") + (set_attr "prefix" "vex") + (set_attr "mode" "TI")]) + (define_insn "sse2_gt3" [(set (match_operand:SSEMODE124 0 "register_operand" "=x") (gt:SSEMODE124 @@ -3978,6 +5866,17 @@ operands[2] = force_reg (mode, gen_rtx_CONST_VECTOR (mode, v)); }) +(define_insn "*avx_nand3" + [(set (match_operand:AVX256MODEI 0 "register_operand" "=x") + (and:AVX256MODEI + (not:AVX256MODEI (match_operand:AVX256MODEI 1 "register_operand" "x")) + (match_operand:AVX256MODEI 2 "nonimmediate_operand" "xm")))] + "TARGET_AVX" + "vandnps\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sselog") + (set_attr "prefix" "vex") + (set_attr "mode" "")]) + (define_insn "*sse_nand3" [(set (match_operand:SSEMODEI 0 "register_operand" "=x") (and:SSEMODEI @@ -3988,6 +5887,17 @@ [(set_attr "type" "sselog") (set_attr "mode" "V4SF")]) +(define_insn "*avx_nand3" + [(set (match_operand:SSEMODEI 0 "register_operand" "=x") + (and:SSEMODEI + (not:SSEMODEI (match_operand:SSEMODEI 1 "register_operand" "x")) + (match_operand:SSEMODEI 2 "nonimmediate_operand" "xm")))] + "TARGET_AVX" + "vpandn\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sselog") + (set_attr "prefix" "vex") + (set_attr "mode" "TI")]) + (define_insn "sse2_nand3" [(set (match_operand:SSEMODEI 0 "register_operand" "=x") (and:SSEMODEI @@ -4018,6 +5928,18 @@ "TARGET_SSE" "ix86_fixup_binary_operands_no_copy (, mode, operands);") +(define_insn "*avx_3" + [(set (match_operand:AVX256MODEI 0 "register_operand" "=x") + (plogic:AVX256MODEI + (match_operand:AVX256MODEI 1 "nonimmediate_operand" "%x") + (match_operand:AVX256MODEI 2 "nonimmediate_operand" "xm")))] + "TARGET_AVX + && ix86_binary_operator_ok (, mode, operands)" + "vps\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sselog") + (set_attr "prefix" "vex") + (set_attr "mode" "")]) + (define_insn "*sse_3" [(set (match_operand:SSEMODEI 0 "register_operand" "=x") (plogic:SSEMODEI @@ -4029,6 +5951,18 @@ [(set_attr "type" "sselog") (set_attr "mode" "V4SF")]) +(define_insn "*avx_3" + [(set (match_operand:SSEMODEI 0 "register_operand" "=x") + (plogic:SSEMODEI + (match_operand:SSEMODEI 1 "nonimmediate_operand" "%x") + (match_operand:SSEMODEI 2 "nonimmediate_operand" "xm")))] + "TARGET_AVX + && ix86_binary_operator_ok (, mode, operands)" + "vp\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sselog") + (set_attr "prefix" "vex") + (set_attr "mode" "TI")]) + (define_insn "*sse2_3" [(set (match_operand:SSEMODEI 0 "register_operand" "=x") (plogic:SSEMODEI @@ -4344,6 +6278,19 @@ (const_int 2)])))] "TARGET_SSE2") +(define_insn "*avx_packsswb" + [(set (match_operand:V16QI 0 "register_operand" "=x") + (vec_concat:V16QI + (ss_truncate:V8QI + (match_operand:V8HI 1 "register_operand" "x")) + (ss_truncate:V8QI + (match_operand:V8HI 2 "nonimmediate_operand" "xm"))))] + "TARGET_AVX" + "vpacksswb\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sselog") + (set_attr "prefix" "vex") + (set_attr 
"mode" "TI")]) + (define_insn "sse2_packsswb" [(set (match_operand:V16QI 0 "register_operand" "=x") (vec_concat:V16QI @@ -4357,6 +6304,19 @@ (set_attr "prefix_data16" "1") (set_attr "mode" "TI")]) +(define_insn "*avx_packssdw" + [(set (match_operand:V8HI 0 "register_operand" "=x") + (vec_concat:V8HI + (ss_truncate:V4HI + (match_operand:V4SI 1 "register_operand" "x")) + (ss_truncate:V4HI + (match_operand:V4SI 2 "nonimmediate_operand" "xm"))))] + "TARGET_AVX" + "vpackssdw\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sselog") + (set_attr "prefix" "vex") + (set_attr "mode" "TI")]) + (define_insn "sse2_packssdw" [(set (match_operand:V8HI 0 "register_operand" "=x") (vec_concat:V8HI @@ -4370,6 +6330,19 @@ (set_attr "prefix_data16" "1") (set_attr "mode" "TI")]) +(define_insn "*avx_packuswb" + [(set (match_operand:V16QI 0 "register_operand" "=x") + (vec_concat:V16QI + (us_truncate:V8QI + (match_operand:V8HI 1 "register_operand" "x")) + (us_truncate:V8QI + (match_operand:V8HI 2 "nonimmediate_operand" "xm"))))] + "TARGET_AVX" + "vpackuswb\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sselog") + (set_attr "prefix" "vex") + (set_attr "mode" "TI")]) + (define_insn "sse2_packuswb" [(set (match_operand:V16QI 0 "register_operand" "=x") (vec_concat:V16QI @@ -4383,6 +6356,26 @@ (set_attr "prefix_data16" "1") (set_attr "mode" "TI")]) +(define_insn "*avx_punpckhbw" + [(set (match_operand:V16QI 0 "register_operand" "=x") + (vec_select:V16QI + (vec_concat:V32QI + (match_operand:V16QI 1 "register_operand" "x") + (match_operand:V16QI 2 "nonimmediate_operand" "xm")) + (parallel [(const_int 8) (const_int 24) + (const_int 9) (const_int 25) + (const_int 10) (const_int 26) + (const_int 11) (const_int 27) + (const_int 12) (const_int 28) + (const_int 13) (const_int 29) + (const_int 14) (const_int 30) + (const_int 15) (const_int 31)])))] + "TARGET_AVX" + "vpunpckhbw\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sselog") + (set_attr "prefix" "vex") + (set_attr "mode" "TI")]) + (define_insn "sse2_punpckhbw" [(set (match_operand:V16QI 0 "register_operand" "=x") (vec_select:V16QI @@ -4403,6 +6396,26 @@ (set_attr "prefix_data16" "1") (set_attr "mode" "TI")]) +(define_insn "*avx_punpcklbw" + [(set (match_operand:V16QI 0 "register_operand" "=x") + (vec_select:V16QI + (vec_concat:V32QI + (match_operand:V16QI 1 "register_operand" "x") + (match_operand:V16QI 2 "nonimmediate_operand" "xm")) + (parallel [(const_int 0) (const_int 16) + (const_int 1) (const_int 17) + (const_int 2) (const_int 18) + (const_int 3) (const_int 19) + (const_int 4) (const_int 20) + (const_int 5) (const_int 21) + (const_int 6) (const_int 22) + (const_int 7) (const_int 23)])))] + "TARGET_AVX" + "vpunpcklbw\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sselog") + (set_attr "prefix" "vex") + (set_attr "mode" "TI")]) + (define_insn "sse2_punpcklbw" [(set (match_operand:V16QI 0 "register_operand" "=x") (vec_select:V16QI @@ -4423,6 +6436,22 @@ (set_attr "prefix_data16" "1") (set_attr "mode" "TI")]) +(define_insn "*avx_punpckhwd" + [(set (match_operand:V8HI 0 "register_operand" "=x") + (vec_select:V8HI + (vec_concat:V16HI + (match_operand:V8HI 1 "register_operand" "x") + (match_operand:V8HI 2 "nonimmediate_operand" "xm")) + (parallel [(const_int 4) (const_int 12) + (const_int 5) (const_int 13) + (const_int 6) (const_int 14) + (const_int 7) (const_int 15)])))] + "TARGET_AVX" + "vpunpckhwd\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sselog") + (set_attr "prefix" "vex") + (set_attr "mode" "TI")]) + (define_insn "sse2_punpckhwd" [(set (match_operand:V8HI 0 
"register_operand" "=x") (vec_select:V8HI @@ -4439,6 +6468,22 @@ (set_attr "prefix_data16" "1") (set_attr "mode" "TI")]) +(define_insn "*avx_punpcklwd" + [(set (match_operand:V8HI 0 "register_operand" "=x") + (vec_select:V8HI + (vec_concat:V16HI + (match_operand:V8HI 1 "register_operand" "x") + (match_operand:V8HI 2 "nonimmediate_operand" "xm")) + (parallel [(const_int 0) (const_int 8) + (const_int 1) (const_int 9) + (const_int 2) (const_int 10) + (const_int 3) (const_int 11)])))] + "TARGET_AVX" + "vpunpcklwd\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sselog") + (set_attr "prefix" "vex") + (set_attr "mode" "TI")]) + (define_insn "sse2_punpcklwd" [(set (match_operand:V8HI 0 "register_operand" "=x") (vec_select:V8HI @@ -4455,6 +6500,20 @@ (set_attr "prefix_data16" "1") (set_attr "mode" "TI")]) +(define_insn "*avx_punpckhdq" + [(set (match_operand:V4SI 0 "register_operand" "=x") + (vec_select:V4SI + (vec_concat:V8SI + (match_operand:V4SI 1 "register_operand" "x") + (match_operand:V4SI 2 "nonimmediate_operand" "xm")) + (parallel [(const_int 2) (const_int 6) + (const_int 3) (const_int 7)])))] + "TARGET_AVX" + "vpunpckhdq\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sselog") + (set_attr "prefix" "vex") + (set_attr "mode" "TI")]) + (define_insn "sse2_punpckhdq" [(set (match_operand:V4SI 0 "register_operand" "=x") (vec_select:V4SI @@ -4469,6 +6528,20 @@ (set_attr "prefix_data16" "1") (set_attr "mode" "TI")]) +(define_insn "*avx_punpckldq" + [(set (match_operand:V4SI 0 "register_operand" "=x") + (vec_select:V4SI + (vec_concat:V8SI + (match_operand:V4SI 1 "register_operand" "x") + (match_operand:V4SI 2 "nonimmediate_operand" "xm")) + (parallel [(const_int 0) (const_int 4) + (const_int 1) (const_int 5)])))] + "TARGET_AVX" + "vpunpckldq\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sselog") + (set_attr "prefix" "vex") + (set_attr "mode" "TI")]) + (define_insn "sse2_punpckldq" [(set (match_operand:V4SI 0 "register_operand" "=x") (vec_select:V4SI @@ -4483,6 +6556,22 @@ (set_attr "prefix_data16" "1") (set_attr "mode" "TI")]) +(define_insn "*avx_pinsr" + [(set (match_operand:SSEMODE124 0 "register_operand" "=x") + (vec_merge:SSEMODE124 + (vec_duplicate:SSEMODE124 + (match_operand: 2 "nonimmediate_operand" "rm")) + (match_operand:SSEMODE124 1 "register_operand" "x") + (match_operand:SI 3 "const_pow2_1_to__operand" "n")))] + "TARGET_AVX" +{ + operands[3] = GEN_INT (exact_log2 (INTVAL (operands[3]))); + return "vpinsr\t{%3, %k2, %1, %0|%0, %1, %k2, %3}"; +} + [(set_attr "type" "sselog") + (set_attr "prefix" "vex") + (set_attr "mode" "TI")]) + (define_insn "*sse4_1_pinsrb" [(set (match_operand:V16QI 0 "register_operand" "=x") (vec_merge:V16QI @@ -4532,6 +6621,22 @@ (set_attr "prefix_extra" "1") (set_attr "mode" "TI")]) +(define_insn "*avx_pinsrq" + [(set (match_operand:V2DI 0 "register_operand" "=x") + (vec_merge:V2DI + (vec_duplicate:V2DI + (match_operand:DI 2 "nonimmediate_operand" "rm")) + (match_operand:V2DI 1 "register_operand" "x") + (match_operand:SI 3 "const_pow2_1_to_2_operand" "n")))] + "TARGET_AVX && TARGET_64BIT" +{ + operands[3] = GEN_INT (exact_log2 (INTVAL (operands[3]))); + return "vpinsrq\t{%3, %2, %1, %0|%0, %1, %2, %3}"; +} + [(set_attr "type" "sselog") + (set_attr "prefix" "vex") + (set_attr "mode" "TI")]) + (define_insn "*sse4_1_pinsrq" [(set (match_operand:V2DI 0 "register_operand" "=x") (vec_merge:V2DI @@ -4555,9 +6660,10 @@ (match_operand:V16QI 1 "register_operand" "x") (parallel [(match_operand:SI 2 "const_0_to_15_operand" "n")]))))] "TARGET_SSE4_1" - "pextrb\t{%2, %1, %0|%0, 
%1, %2}" + "%vpextrb\t{%2, %1, %0|%0, %1, %2}" [(set_attr "type" "sselog") (set_attr "prefix_extra" "1") + (set_attr "prefix" "maybe_vex") (set_attr "mode" "TI")]) (define_insn "*sse4_1_pextrb_memory" @@ -4566,9 +6672,10 @@ (match_operand:V16QI 1 "register_operand" "x") (parallel [(match_operand:SI 2 "const_0_to_15_operand" "n")])))] "TARGET_SSE4_1" - "pextrb\t{%2, %1, %0|%0, %1, %2}" + "%vpextrb\t{%2, %1, %0|%0, %1, %2}" [(set_attr "type" "sselog") (set_attr "prefix_extra" "1") + (set_attr "prefix" "maybe_vex") (set_attr "mode" "TI")]) (define_insn "*sse2_pextrw" @@ -4578,9 +6685,10 @@ (match_operand:V8HI 1 "register_operand" "x") (parallel [(match_operand:SI 2 "const_0_to_7_operand" "n")]))))] "TARGET_SSE2" - "pextrw\t{%2, %1, %0|%0, %1, %2}" + "%vpextrw\t{%2, %1, %0|%0, %1, %2}" [(set_attr "type" "sselog") (set_attr "prefix_data16" "1") + (set_attr "prefix" "maybe_vex") (set_attr "mode" "TI")]) (define_insn "*sse4_1_pextrw_memory" @@ -4589,9 +6697,10 @@ (match_operand:V8HI 1 "register_operand" "x") (parallel [(match_operand:SI 2 "const_0_to_7_operand" "n")])))] "TARGET_SSE4_1" - "pextrw\t{%2, %1, %0|%0, %1, %2}" + "%vpextrw\t{%2, %1, %0|%0, %1, %2}" [(set_attr "type" "sselog") (set_attr "prefix_extra" "1") + (set_attr "prefix" "maybe_vex") (set_attr "mode" "TI")]) (define_insn "*sse4_1_pextrd" @@ -4600,9 +6709,10 @@ (match_operand:V4SI 1 "register_operand" "x") (parallel [(match_operand:SI 2 "const_0_to_3_operand" "n")])))] "TARGET_SSE4_1" - "pextrd\t{%2, %1, %0|%0, %1, %2}" + "%vpextrd\t{%2, %1, %0|%0, %1, %2}" [(set_attr "type" "sselog") (set_attr "prefix_extra" "1") + (set_attr "prefix" "maybe_vex") (set_attr "mode" "TI")]) ;; It must come before *vec_extractv2di_1_sse since it is preferred. @@ -4612,9 +6722,10 @@ (match_operand:V2DI 1 "register_operand" "x") (parallel [(match_operand:SI 2 "const_0_to_1_operand" "n")])))] "TARGET_SSE4_1 && TARGET_64BIT" - "pextrq\t{%2, %1, %0|%0, %1, %2}" + "%vpextrq\t{%2, %1, %0|%0, %1, %2}" [(set_attr "type" "sselog") (set_attr "prefix_extra" "1") + (set_attr "prefix" "maybe_vex") (set_attr "mode" "TI")]) (define_expand "sse2_pshufd" @@ -4649,10 +6760,11 @@ mask |= INTVAL (operands[5]) << 6; operands[2] = GEN_INT (mask); - return "pshufd\t{%2, %1, %0|%0, %1, %2}"; + return "%vpshufd\t{%2, %1, %0|%0, %1, %2}"; } [(set_attr "type" "sselog1") (set_attr "prefix_data16" "1") + (set_attr "prefix" "vex") (set_attr "mode" "TI")]) (define_expand "sse2_pshuflw" @@ -4691,10 +6803,11 @@ mask |= INTVAL (operands[5]) << 6; operands[2] = GEN_INT (mask); - return "pshuflw\t{%2, %1, %0|%0, %1, %2}"; + return "%vpshuflw\t{%2, %1, %0|%0, %1, %2}"; } [(set_attr "type" "sselog") (set_attr "prefix_rep" "1") + (set_attr "prefix" "maybe_vex") (set_attr "mode" "TI")]) (define_expand "sse2_pshufhw" @@ -4733,10 +6846,11 @@ mask |= (INTVAL (operands[5]) - 4) << 6; operands[2] = GEN_INT (mask); - return "pshufhw\t{%2, %1, %0|%0, %1, %2}"; + return "%vpshufhw\t{%2, %1, %0|%0, %1, %2}"; } [(set_attr "type" "sselog") (set_attr "prefix_rep" "1") + (set_attr "prefix" "maybe_vex") (set_attr "mode" "TI")]) (define_expand "sse2_loadd" @@ -4749,6 +6863,22 @@ "TARGET_SSE" "operands[2] = CONST0_RTX (V4SImode);") +(define_insn "*avx_loadld" + [(set (match_operand:V4SI 0 "register_operand" "=x,Yi,x") + (vec_merge:V4SI + (vec_duplicate:V4SI + (match_operand:SI 2 "nonimmediate_operand" "m ,r ,x")) + (match_operand:V4SI 1 "reg_or_0_operand" "C ,C ,x") + (const_int 1)))] + "TARGET_AVX" + "@ + vmovd\t{%2, %0|%0, %2} + vmovd\t{%2, %0|%0, %2} + vmovss\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" 
"ssemov") + (set_attr "prefix" "vex") + (set_attr "mode" "TI,TI,V4SF")]) + (define_insn "sse2_loadld" [(set (match_operand:V4SI 0 "register_operand" "=Y2,Yi,x,x") (vec_merge:V4SI @@ -4814,8 +6944,9 @@ "@ # # - mov{q}\t{%1, %0|%0, %1}" + %vmov{q}\t{%1, %0|%0, %1}" [(set_attr "type" "*,*,imov") + (set_attr "prefix" "*,*,maybe_vex") (set_attr "mode" "*,*,DI")]) (define_insn "*sse2_storeq" @@ -4841,6 +6972,24 @@ operands[1] = gen_rtx_REG (DImode, REGNO (operands[1])); }) +(define_insn "*vec_extractv2di_1_rex64_avx" + [(set (match_operand:DI 0 "nonimmediate_operand" "=m,x,x,r") + (vec_select:DI + (match_operand:V2DI 1 "nonimmediate_operand" "x,x,o,o") + (parallel [(const_int 1)])))] + "TARGET_64BIT + && TARGET_AVX + && !(MEM_P (operands[0]) && MEM_P (operands[1]))" + "@ + vmovhps\t{%1, %0|%0, %1} + vpsrldq\t{$8, %1, %0|%0, %1, 8} + vmovq\t{%H1, %0|%0, %H1} + vmov{q}\t{%H1, %0|%0, %H1}" + [(set_attr "type" "ssemov,sseishft,ssemov,imov") + (set_attr "memory" "*,none,*,*") + (set_attr "prefix" "vex") + (set_attr "mode" "V2SF,TI,TI,DI")]) + (define_insn "*vec_extractv2di_1_rex64" [(set (match_operand:DI 0 "nonimmediate_operand" "=m,x,x,r") (vec_select:DI @@ -4856,6 +7005,23 @@ (set_attr "memory" "*,none,*,*") (set_attr "mode" "V2SF,TI,TI,DI")]) +(define_insn "*vec_extractv2di_1_avx" + [(set (match_operand:DI 0 "nonimmediate_operand" "=m,x,x") + (vec_select:DI + (match_operand:V2DI 1 "nonimmediate_operand" "x,x,o") + (parallel [(const_int 1)])))] + "!TARGET_64BIT + && TARGET_AVX + && !(MEM_P (operands[0]) && MEM_P (operands[1]))" + "@ + vmovhps\t{%1, %0|%0, %1} + vpsrldq\t{$8, %1, %0|%0, %1, 8} + vmovq\t{%H1, %0|%0, %H1}" + [(set_attr "type" "ssemov,sseishft,ssemov") + (set_attr "memory" "*,none,*") + (set_attr "prefix" "vex") + (set_attr "mode" "V2SF,TI,TI")]) + (define_insn "*vec_extractv2di_1_sse2" [(set (match_operand:DI 0 "nonimmediate_operand" "=m,x,x") (vec_select:DI @@ -4892,15 +7058,26 @@ (match_operand:SI 1 "register_operand" " Y2,0")))] "TARGET_SSE" "@ - pshufd\t{$0, %1, %0|%0, %1, 0} + %vpshufd\t{$0, %1, %0|%0, %1, 0} shufps\t{$0, %0, %0|%0, %0, 0}" [(set_attr "type" "sselog1") + (set_attr "prefix" "maybe_vex,orig") (set_attr "mode" "TI,V4SF")]) -(define_insn "*vec_dupv2di" - [(set (match_operand:V2DI 0 "register_operand" "=Y2,x") +(define_insn "*vec_dupv2di_avx" + [(set (match_operand:V2DI 0 "register_operand" "=x") (vec_duplicate:V2DI - (match_operand:DI 1 "register_operand" " 0 ,0")))] + (match_operand:DI 1 "register_operand" "x")))] + "TARGET_AVX" + "vpunpcklqdq\t{%1, %1, %0|%0, %1, %1}" + [(set_attr "type" "sselog1") + (set_attr "prefix" "vex") + (set_attr "mode" "TI")]) + +(define_insn "*vec_dupv2di" + [(set (match_operand:V2DI 0 "register_operand" "=Y2,x") + (vec_duplicate:V2DI + (match_operand:DI 1 "register_operand" " 0 ,0")))] "TARGET_SSE" "@ punpcklqdq\t%0, %0 @@ -4908,6 +7085,25 @@ [(set_attr "type" "sselog1,ssemov") (set_attr "mode" "TI,V4SF")]) +(define_insn "*vec_concatv2si_avx" + [(set (match_operand:V2SI 0 "register_operand" "=x,x,x ,*y ,*y") + (vec_concat:V2SI + (match_operand:SI 1 "nonimmediate_operand" "x ,x,rm, 0 ,rm") + (match_operand:SI 2 "vector_move_operand" "rm,x,C ,*ym,C")))] + "TARGET_AVX" + "@ + vpinsrd\t{$0x1, %2, %1, %0|%0, %1, %2, 0x1} + vpunpckldq\t{%2, %1, %0|%0, %1, %2} + vmovd\t{%1, %0|%0, %1} + punpckldq\t{%2, %0|%0, %2} + movd\t{%1, %0|%0, %1}" + [(set_attr "type" "sselog,sselog,ssemov,mmxcvt,mmxmov") + (set (attr "prefix") + (if_then_else (eq_attr "alternative" "3,4") + (const_string "orig") + (const_string "vex"))) + (set_attr "mode" 
"TI,TI,TI,DI,DI")]) + (define_insn "*vec_concatv2si_sse4_1" [(set (match_operand:V2SI 0 "register_operand" "=x,x,x ,*y ,*y") (vec_concat:V2SI @@ -4955,6 +7151,19 @@ [(set_attr "type" "sselog,ssemov,mmxcvt,mmxmov") (set_attr "mode" "V4SF,V4SF,DI,DI")]) +(define_insn "*vec_concatv4si_1_avx" + [(set (match_operand:V4SI 0 "register_operand" "=x,x") + (vec_concat:V4SI + (match_operand:V2SI 1 "register_operand" " x,x") + (match_operand:V2SI 2 "nonimmediate_operand" " x,m")))] + "TARGET_AVX" + "@ + vpunpcklqdq\t{%2, %1, %0|%0, %1, %2} + vmovhps\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sselog,ssemov") + (set_attr "prefix" "vex") + (set_attr "mode" "TI,V2SF")]) + (define_insn "*vec_concatv4si_1" [(set (match_operand:V4SI 0 "register_operand" "=Y2,x,x") (vec_concat:V4SI @@ -4968,6 +7177,24 @@ [(set_attr "type" "sselog,ssemov,ssemov") (set_attr "mode" "TI,V4SF,V2SF")]) +(define_insn "*vec_concatv2di_avx" + [(set (match_operand:V2DI 0 "register_operand" "=x,?x,x,x") + (vec_concat:V2DI + (match_operand:DI 1 "nonimmediate_operand" " m,*y,x,x") + (match_operand:DI 2 "vector_move_operand" " C, C,x,m")))] + "!TARGET_64BIT && TARGET_AVX" + "@ + vmovq\t{%1, %0|%0, %1} + movq2dq\t{%1, %0|%0, %1} + vpunpcklqdq\t{%2, %1, %0|%0, %1, %2} + vmovhps\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "ssemov,ssemov,sselog,ssemov") + (set (attr "prefix") + (if_then_else (eq_attr "alternative" "1") + (const_string "orig") + (const_string "vex"))) + (set_attr "mode" "TI,TI,TI,V2SF")]) + (define_insn "vec_concatv2di" [(set (match_operand:V2DI 0 "register_operand" "=Y2 ,?Y2,Y2,x,x") (vec_concat:V2DI @@ -4983,6 +7210,26 @@ [(set_attr "type" "ssemov,ssemov,sselog,ssemov,ssemov") (set_attr "mode" "TI,TI,TI,V4SF,V2SF")]) +(define_insn "*vec_concatv2di_rex64_avx" + [(set (match_operand:V2DI 0 "register_operand" "=x,x,Yi,!x,x,x") + (vec_concat:V2DI + (match_operand:DI 1 "nonimmediate_operand" " x,m,r ,*y,x,x") + (match_operand:DI 2 "vector_move_operand" "rm,C,C ,C ,x,m")))] + "TARGET_64BIT && TARGET_AVX" + "@ + vpinsrq\t{$0x1, %2, %1, %0|%0, %1, %2, 0x1} + vmovq\t{%1, %0|%0, %1} + vmovq\t{%1, %0|%0, %1} + movq2dq\t{%1, %0|%0, %1} + vpunpcklqdq\t{%2, %1, %0|%0, %1, %2} + vmovhps\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sselog,ssemov,ssemov,ssemov,sselog,ssemov") + (set (attr "prefix") + (if_then_else (eq_attr "alternative" "3") + (const_string "orig") + (const_string "vex"))) + (set_attr "mode" "TI,TI,TI,TI,TI,V2SF")]) + (define_insn "*vec_concatv2di_rex64_sse4_1" [(set (match_operand:V2DI 0 "register_operand" "=x ,x ,Yi,!x,x,x,x") (vec_concat:V2DI @@ -5213,6 +7460,31 @@ "TARGET_SSE2" "ix86_fixup_binary_operands_no_copy (PLUS, V16QImode, operands);") +(define_insn "*avx_uavgv16qi3" + [(set (match_operand:V16QI 0 "register_operand" "=x") + (truncate:V16QI + (lshiftrt:V16HI + (plus:V16HI + (plus:V16HI + (zero_extend:V16HI + (match_operand:V16QI 1 "nonimmediate_operand" "%x")) + (zero_extend:V16HI + (match_operand:V16QI 2 "nonimmediate_operand" "xm"))) + (const_vector:V16QI [(const_int 1) (const_int 1) + (const_int 1) (const_int 1) + (const_int 1) (const_int 1) + (const_int 1) (const_int 1) + (const_int 1) (const_int 1) + (const_int 1) (const_int 1) + (const_int 1) (const_int 1) + (const_int 1) (const_int 1)])) + (const_int 1))))] + "TARGET_AVX && ix86_binary_operator_ok (PLUS, V16QImode, operands)" + "vpavgb\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sseiadd") + (set_attr "prefix" "vex") + (set_attr "mode" "TI")]) + (define_insn "*sse2_uavgv16qi3" [(set (match_operand:V16QI 0 "register_operand" "=x") (truncate:V16QI @@ 
-5256,6 +7528,27 @@ "TARGET_SSE2" "ix86_fixup_binary_operands_no_copy (PLUS, V8HImode, operands);") +(define_insn "*avx_uavgv8hi3" + [(set (match_operand:V8HI 0 "register_operand" "=x") + (truncate:V8HI + (lshiftrt:V8SI + (plus:V8SI + (plus:V8SI + (zero_extend:V8SI + (match_operand:V8HI 1 "nonimmediate_operand" "%x")) + (zero_extend:V8SI + (match_operand:V8HI 2 "nonimmediate_operand" "xm"))) + (const_vector:V8HI [(const_int 1) (const_int 1) + (const_int 1) (const_int 1) + (const_int 1) (const_int 1) + (const_int 1) (const_int 1)])) + (const_int 1))))] + "TARGET_AVX && ix86_binary_operator_ok (PLUS, V8HImode, operands)" + "vpavgw\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sseiadd") + (set_attr "prefix" "vex") + (set_attr "mode" "TI")]) + (define_insn "*sse2_uavgv8hi3" [(set (match_operand:V8HI 0 "register_operand" "=x") (truncate:V8HI @@ -5279,6 +7572,17 @@ ;; The correct representation for this is absolutely enormous, and ;; surely not generally useful. +(define_insn "*avx_psadbw" + [(set (match_operand:V2DI 0 "register_operand" "=x") + (unspec:V2DI [(match_operand:V16QI 1 "register_operand" "x") + (match_operand:V16QI 2 "nonimmediate_operand" "xm")] + UNSPEC_PSADBW))] + "TARGET_AVX" + "vpsadbw\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sseiadd") + (set_attr "prefix" "vex") + (set_attr "mode" "TI")]) + (define_insn "sse2_psadbw" [(set (match_operand:V2DI 0 "register_operand" "=x") (unspec:V2DI [(match_operand:V16QI 1 "register_operand" "0") @@ -5290,14 +7594,26 @@ (set_attr "prefix_data16" "1") (set_attr "mode" "TI")]) +(define_insn "avx_movmskp256" + [(set (match_operand:SI 0 "register_operand" "=r") + (unspec:SI + [(match_operand:AVX256MODEF2P 1 "register_operand" "x")] + UNSPEC_MOVMSK))] + "AVX256_VEC_FLOAT_MODE_P (mode)" + "vmovmskp\t{%1, %0|%0, %1}" + [(set_attr "type" "ssecvt") + (set_attr "prefix" "vex") + (set_attr "mode" "")]) + (define_insn "_movmskp" [(set (match_operand:SI 0 "register_operand" "=r") (unspec:SI [(match_operand:SSEMODEF2P 1 "register_operand" "x")] UNSPEC_MOVMSK))] "SSE_VEC_FLOAT_MODE_P (mode)" - "movmskp\t{%1, %0|%0, %1}" + "%vmovmskp\t{%1, %0|%0, %1}" [(set_attr "type" "ssecvt") + (set_attr "prefix" "maybe_vex") (set_attr "mode" "")]) (define_insn "sse2_pmovmskb" @@ -5305,9 +7621,10 @@ (unspec:SI [(match_operand:V16QI 1 "register_operand" "x")] UNSPEC_MOVMSK))] "TARGET_SSE2" - "pmovmskb\t{%1, %0|%0, %1}" + "%vpmovmskb\t{%1, %0|%0, %1}" [(set_attr "type" "ssecvt") (set_attr "prefix_data16" "1") + (set_attr "prefix" "maybe_vex") (set_attr "mode" "SI")]) (define_expand "sse2_maskmovdqu" @@ -5327,9 +7644,10 @@ UNSPEC_MASKMOV))] "TARGET_SSE2 && !TARGET_64BIT" ;; @@@ check ordering of operands in intel/nonintel syntax - "maskmovdqu\t{%2, %1|%1, %2}" + "%vmaskmovdqu\t{%2, %1|%1, %2}" [(set_attr "type" "ssecvt") (set_attr "prefix_data16" "1") + (set_attr "prefix" "maybe_vex") (set_attr "mode" "TI")]) (define_insn "*sse2_maskmovdqu_rex64" @@ -5340,25 +7658,28 @@ UNSPEC_MASKMOV))] "TARGET_SSE2 && TARGET_64BIT" ;; @@@ check ordering of operands in intel/nonintel syntax - "maskmovdqu\t{%2, %1|%1, %2}" + "%vmaskmovdqu\t{%2, %1|%1, %2}" [(set_attr "type" "ssecvt") (set_attr "prefix_data16" "1") + (set_attr "prefix" "maybe_vex") (set_attr "mode" "TI")]) (define_insn "sse_ldmxcsr" [(unspec_volatile [(match_operand:SI 0 "memory_operand" "m")] UNSPECV_LDMXCSR)] "TARGET_SSE" - "ldmxcsr\t%0" + "%vldmxcsr\t%0" [(set_attr "type" "sse") + (set_attr "prefix" "maybe_vex") (set_attr "memory" "load")]) (define_insn "sse_stmxcsr" [(set (match_operand:SI 0 "memory_operand" "=m") 
(unspec_volatile:SI [(const_int 0)] UNSPECV_STMXCSR))] "TARGET_SSE" - "stmxcsr\t%0" + "%vstmxcsr\t%0" [(set_attr "type" "sse") + (set_attr "prefix" "maybe_vex") (set_attr "memory" "store")]) (define_expand "sse_sfence" @@ -5458,6 +7779,49 @@ ;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +(define_insn "*avx_phaddwv8hi3" + [(set (match_operand:V8HI 0 "register_operand" "=x") + (vec_concat:V8HI + (vec_concat:V4HI + (vec_concat:V2HI + (plus:HI + (vec_select:HI + (match_operand:V8HI 1 "register_operand" "x") + (parallel [(const_int 0)])) + (vec_select:HI (match_dup 1) (parallel [(const_int 1)]))) + (plus:HI + (vec_select:HI (match_dup 1) (parallel [(const_int 2)])) + (vec_select:HI (match_dup 1) (parallel [(const_int 3)])))) + (vec_concat:V2HI + (plus:HI + (vec_select:HI (match_dup 1) (parallel [(const_int 4)])) + (vec_select:HI (match_dup 1) (parallel [(const_int 5)]))) + (plus:HI + (vec_select:HI (match_dup 1) (parallel [(const_int 6)])) + (vec_select:HI (match_dup 1) (parallel [(const_int 7)]))))) + (vec_concat:V4HI + (vec_concat:V2HI + (plus:HI + (vec_select:HI + (match_operand:V8HI 2 "nonimmediate_operand" "xm") + (parallel [(const_int 0)])) + (vec_select:HI (match_dup 2) (parallel [(const_int 1)]))) + (plus:HI + (vec_select:HI (match_dup 2) (parallel [(const_int 2)])) + (vec_select:HI (match_dup 2) (parallel [(const_int 3)])))) + (vec_concat:V2HI + (plus:HI + (vec_select:HI (match_dup 2) (parallel [(const_int 4)])) + (vec_select:HI (match_dup 2) (parallel [(const_int 5)]))) + (plus:HI + (vec_select:HI (match_dup 2) (parallel [(const_int 6)])) + (vec_select:HI (match_dup 2) (parallel [(const_int 7)])))))))] + "TARGET_AVX" + "vphaddw\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sseiadd") + (set_attr "prefix" "vex") + (set_attr "mode" "TI")]) + (define_insn "ssse3_phaddwv8hi3" [(set (match_operand:V8HI 0 "register_operand" "=x") (vec_concat:V8HI @@ -5529,6 +7893,33 @@ (set_attr "prefix_extra" "1") (set_attr "mode" "DI")]) +(define_insn "*avx_phadddv4si3" + [(set (match_operand:V4SI 0 "register_operand" "=x") + (vec_concat:V4SI + (vec_concat:V2SI + (plus:SI + (vec_select:SI + (match_operand:V4SI 1 "register_operand" "x") + (parallel [(const_int 0)])) + (vec_select:SI (match_dup 1) (parallel [(const_int 1)]))) + (plus:SI + (vec_select:SI (match_dup 1) (parallel [(const_int 2)])) + (vec_select:SI (match_dup 1) (parallel [(const_int 3)])))) + (vec_concat:V2SI + (plus:SI + (vec_select:SI + (match_operand:V4SI 2 "nonimmediate_operand" "xm") + (parallel [(const_int 0)])) + (vec_select:SI (match_dup 2) (parallel [(const_int 1)]))) + (plus:SI + (vec_select:SI (match_dup 2) (parallel [(const_int 2)])) + (vec_select:SI (match_dup 2) (parallel [(const_int 3)]))))))] + "TARGET_AVX" + "vphaddd\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sseiadd") + (set_attr "prefix" "vex") + (set_attr "mode" "TI")]) + (define_insn "ssse3_phadddv4si3" [(set (match_operand:V4SI 0 "register_operand" "=x") (vec_concat:V4SI @@ -5576,6 +7967,49 @@ (set_attr "prefix_extra" "1") (set_attr "mode" "DI")]) +(define_insn "*avx_phaddswv8hi3" + [(set (match_operand:V8HI 0 "register_operand" "=x") + (vec_concat:V8HI + (vec_concat:V4HI + (vec_concat:V2HI + (ss_plus:HI + (vec_select:HI + (match_operand:V8HI 1 "register_operand" "x") + (parallel [(const_int 0)])) + (vec_select:HI (match_dup 1) (parallel [(const_int 1)]))) + (ss_plus:HI + (vec_select:HI (match_dup 1) (parallel [(const_int 2)])) + (vec_select:HI (match_dup 1) (parallel [(const_int 3)])))) + (vec_concat:V2HI + (ss_plus:HI + (vec_select:HI 
(match_dup 1) (parallel [(const_int 4)])) + (vec_select:HI (match_dup 1) (parallel [(const_int 5)]))) + (ss_plus:HI + (vec_select:HI (match_dup 1) (parallel [(const_int 6)])) + (vec_select:HI (match_dup 1) (parallel [(const_int 7)]))))) + (vec_concat:V4HI + (vec_concat:V2HI + (ss_plus:HI + (vec_select:HI + (match_operand:V8HI 2 "nonimmediate_operand" "xm") + (parallel [(const_int 0)])) + (vec_select:HI (match_dup 2) (parallel [(const_int 1)]))) + (ss_plus:HI + (vec_select:HI (match_dup 2) (parallel [(const_int 2)])) + (vec_select:HI (match_dup 2) (parallel [(const_int 3)])))) + (vec_concat:V2HI + (ss_plus:HI + (vec_select:HI (match_dup 2) (parallel [(const_int 4)])) + (vec_select:HI (match_dup 2) (parallel [(const_int 5)]))) + (ss_plus:HI + (vec_select:HI (match_dup 2) (parallel [(const_int 6)])) + (vec_select:HI (match_dup 2) (parallel [(const_int 7)])))))))] + "TARGET_AVX" + "vphaddsw\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sseiadd") + (set_attr "prefix" "vex") + (set_attr "mode" "TI")]) + (define_insn "ssse3_phaddswv8hi3" [(set (match_operand:V8HI 0 "register_operand" "=x") (vec_concat:V8HI @@ -5647,6 +8081,49 @@ (set_attr "prefix_extra" "1") (set_attr "mode" "DI")]) +(define_insn "*avx_phsubwv8hi3" + [(set (match_operand:V8HI 0 "register_operand" "=x") + (vec_concat:V8HI + (vec_concat:V4HI + (vec_concat:V2HI + (minus:HI + (vec_select:HI + (match_operand:V8HI 1 "register_operand" "x") + (parallel [(const_int 0)])) + (vec_select:HI (match_dup 1) (parallel [(const_int 1)]))) + (minus:HI + (vec_select:HI (match_dup 1) (parallel [(const_int 2)])) + (vec_select:HI (match_dup 1) (parallel [(const_int 3)])))) + (vec_concat:V2HI + (minus:HI + (vec_select:HI (match_dup 1) (parallel [(const_int 4)])) + (vec_select:HI (match_dup 1) (parallel [(const_int 5)]))) + (minus:HI + (vec_select:HI (match_dup 1) (parallel [(const_int 6)])) + (vec_select:HI (match_dup 1) (parallel [(const_int 7)]))))) + (vec_concat:V4HI + (vec_concat:V2HI + (minus:HI + (vec_select:HI + (match_operand:V8HI 2 "nonimmediate_operand" "xm") + (parallel [(const_int 0)])) + (vec_select:HI (match_dup 2) (parallel [(const_int 1)]))) + (minus:HI + (vec_select:HI (match_dup 2) (parallel [(const_int 2)])) + (vec_select:HI (match_dup 2) (parallel [(const_int 3)])))) + (vec_concat:V2HI + (minus:HI + (vec_select:HI (match_dup 2) (parallel [(const_int 4)])) + (vec_select:HI (match_dup 2) (parallel [(const_int 5)]))) + (minus:HI + (vec_select:HI (match_dup 2) (parallel [(const_int 6)])) + (vec_select:HI (match_dup 2) (parallel [(const_int 7)])))))))] + "TARGET_AVX" + "vphsubw\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sseiadd") + (set_attr "prefix" "vex") + (set_attr "mode" "TI")]) + (define_insn "ssse3_phsubwv8hi3" [(set (match_operand:V8HI 0 "register_operand" "=x") (vec_concat:V8HI @@ -5718,6 +8195,33 @@ (set_attr "prefix_extra" "1") (set_attr "mode" "DI")]) +(define_insn "*avx_phsubdv4si3" + [(set (match_operand:V4SI 0 "register_operand" "=x") + (vec_concat:V4SI + (vec_concat:V2SI + (minus:SI + (vec_select:SI + (match_operand:V4SI 1 "register_operand" "x") + (parallel [(const_int 0)])) + (vec_select:SI (match_dup 1) (parallel [(const_int 1)]))) + (minus:SI + (vec_select:SI (match_dup 1) (parallel [(const_int 2)])) + (vec_select:SI (match_dup 1) (parallel [(const_int 3)])))) + (vec_concat:V2SI + (minus:SI + (vec_select:SI + (match_operand:V4SI 2 "nonimmediate_operand" "xm") + (parallel [(const_int 0)])) + (vec_select:SI (match_dup 2) (parallel [(const_int 1)]))) + (minus:SI + (vec_select:SI (match_dup 2) (parallel [(const_int 
2)])) + (vec_select:SI (match_dup 2) (parallel [(const_int 3)]))))))] + "TARGET_AVX" + "vphsubd\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sseiadd") + (set_attr "prefix" "vex") + (set_attr "mode" "TI")]) + (define_insn "ssse3_phsubdv4si3" [(set (match_operand:V4SI 0 "register_operand" "=x") (vec_concat:V4SI @@ -5765,6 +8269,49 @@ (set_attr "prefix_extra" "1") (set_attr "mode" "DI")]) +(define_insn "*avx_phsubswv8hi3" + [(set (match_operand:V8HI 0 "register_operand" "=x") + (vec_concat:V8HI + (vec_concat:V4HI + (vec_concat:V2HI + (ss_minus:HI + (vec_select:HI + (match_operand:V8HI 1 "register_operand" "x") + (parallel [(const_int 0)])) + (vec_select:HI (match_dup 1) (parallel [(const_int 1)]))) + (ss_minus:HI + (vec_select:HI (match_dup 1) (parallel [(const_int 2)])) + (vec_select:HI (match_dup 1) (parallel [(const_int 3)])))) + (vec_concat:V2HI + (ss_minus:HI + (vec_select:HI (match_dup 1) (parallel [(const_int 4)])) + (vec_select:HI (match_dup 1) (parallel [(const_int 5)]))) + (ss_minus:HI + (vec_select:HI (match_dup 1) (parallel [(const_int 6)])) + (vec_select:HI (match_dup 1) (parallel [(const_int 7)]))))) + (vec_concat:V4HI + (vec_concat:V2HI + (ss_minus:HI + (vec_select:HI + (match_operand:V8HI 2 "nonimmediate_operand" "xm") + (parallel [(const_int 0)])) + (vec_select:HI (match_dup 2) (parallel [(const_int 1)]))) + (ss_minus:HI + (vec_select:HI (match_dup 2) (parallel [(const_int 2)])) + (vec_select:HI (match_dup 2) (parallel [(const_int 3)])))) + (vec_concat:V2HI + (ss_minus:HI + (vec_select:HI (match_dup 2) (parallel [(const_int 4)])) + (vec_select:HI (match_dup 2) (parallel [(const_int 5)]))) + (ss_minus:HI + (vec_select:HI (match_dup 2) (parallel [(const_int 6)])) + (vec_select:HI (match_dup 2) (parallel [(const_int 7)])))))))] + "TARGET_AVX" + "vphsubsw\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sseiadd") + (set_attr "prefix" "vex") + (set_attr "mode" "TI")]) + (define_insn "ssse3_phsubswv8hi3" [(set (match_operand:V8HI 0 "register_operand" "=x") (vec_concat:V8HI @@ -5833,8 +8380,61 @@ "TARGET_SSSE3" "phsubsw\t{%2, %0|%0, %2}" [(set_attr "type" "sseiadd") - (set_attr "prefix_extra" "1") - (set_attr "mode" "DI")]) + (set_attr "prefix_extra" "1") + (set_attr "mode" "DI")]) + +(define_insn "*avx_pmaddubsw128" + [(set (match_operand:V8HI 0 "register_operand" "=x") + (ss_plus:V8HI + (mult:V8HI + (zero_extend:V8HI + (vec_select:V4QI + (match_operand:V16QI 1 "register_operand" "x") + (parallel [(const_int 0) + (const_int 2) + (const_int 4) + (const_int 6) + (const_int 8) + (const_int 10) + (const_int 12) + (const_int 14)]))) + (sign_extend:V8HI + (vec_select:V8QI + (match_operand:V16QI 2 "nonimmediate_operand" "xm") + (parallel [(const_int 0) + (const_int 2) + (const_int 4) + (const_int 6) + (const_int 8) + (const_int 10) + (const_int 12) + (const_int 14)])))) + (mult:V8HI + (zero_extend:V8HI + (vec_select:V16QI (match_dup 1) + (parallel [(const_int 1) + (const_int 3) + (const_int 5) + (const_int 7) + (const_int 9) + (const_int 11) + (const_int 13) + (const_int 15)]))) + (sign_extend:V8HI + (vec_select:V16QI (match_dup 2) + (parallel [(const_int 1) + (const_int 3) + (const_int 5) + (const_int 7) + (const_int 9) + (const_int 11) + (const_int 13) + (const_int 15)]))))))] + "TARGET_AVX" + "vpmaddubsw\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sseiadd") + (set_attr "prefix" "vex") + (set_attr "mode" "TI")]) (define_insn "ssse3_pmaddubsw128" [(set (match_operand:V8HI 0 "register_operand" "=x") @@ -5947,6 +8547,29 @@ "TARGET_SSSE3" "ix86_fixup_binary_operands_no_copy (MULT, 
V8HImode, operands);") +(define_insn "*avx_pmulhrswv8hi3" + [(set (match_operand:V8HI 0 "register_operand" "=x") + (truncate:V8HI + (lshiftrt:V8SI + (plus:V8SI + (lshiftrt:V8SI + (mult:V8SI + (sign_extend:V8SI + (match_operand:V8HI 1 "nonimmediate_operand" "%x")) + (sign_extend:V8SI + (match_operand:V8HI 2 "nonimmediate_operand" "xm"))) + (const_int 14)) + (const_vector:V8HI [(const_int 1) (const_int 1) + (const_int 1) (const_int 1) + (const_int 1) (const_int 1) + (const_int 1) (const_int 1)])) + (const_int 1))))] + "TARGET_AVX && ix86_binary_operator_ok (MULT, V8HImode, operands)" + "vpmulhrsw\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sseimul") + (set_attr "prefix" "vex") + (set_attr "mode" "TI")]) + (define_insn "*ssse3_pmulhrswv8hi3" [(set (match_operand:V8HI 0 "register_operand" "=x") (truncate:V8HI @@ -6010,6 +8633,17 @@ (set_attr "prefix_extra" "1") (set_attr "mode" "DI")]) +(define_insn "*avx_pshufbv16qi3" + [(set (match_operand:V16QI 0 "register_operand" "=x") + (unspec:V16QI [(match_operand:V16QI 1 "register_operand" "x") + (match_operand:V16QI 2 "nonimmediate_operand" "xm")] + UNSPEC_PSHUFB))] + "TARGET_AVX" + "vpshufb\t{%2, %1, %0|%0, %1, %2}"; + [(set_attr "type" "sselog1") + (set_attr "prefix" "vex") + (set_attr "mode" "TI")]) + (define_insn "ssse3_pshufbv16qi3" [(set (match_operand:V16QI 0 "register_operand" "=x") (unspec:V16QI [(match_operand:V16QI 1 "register_operand" "0") @@ -6033,6 +8667,18 @@ (set_attr "prefix_extra" "1") (set_attr "mode" "DI")]) +(define_insn "*avx_psign3" + [(set (match_operand:SSEMODE124 0 "register_operand" "=x") + (unspec:SSEMODE124 + [(match_operand:SSEMODE124 1 "register_operand" "x") + (match_operand:SSEMODE124 2 "nonimmediate_operand" "xm")] + UNSPEC_PSIGN))] + "TARGET_AVX" + "vpsign\t{%2, %1, %0|%0, %1, %2}"; + [(set_attr "type" "sselog1") + (set_attr "prefix" "vex") + (set_attr "mode" "TI")]) + (define_insn "ssse3_psign3" [(set (match_operand:SSEMODE124 0 "register_operand" "=x") (unspec:SSEMODE124 @@ -6058,6 +8704,21 @@ (set_attr "prefix_extra" "1") (set_attr "mode" "DI")]) +(define_insn "*avx_palignrti" + [(set (match_operand:TI 0 "register_operand" "=x") + (unspec:TI [(match_operand:TI 1 "register_operand" "x") + (match_operand:TI 2 "nonimmediate_operand" "xm") + (match_operand:SI 3 "const_0_to_255_mul_8_operand" "n")] + UNSPEC_PALIGNR))] + "TARGET_AVX" +{ + operands[3] = GEN_INT (INTVAL (operands[3]) / 8); + return "vpalignr\t{%3, %2, %1, %0|%0, %1, %2, %3}"; +} + [(set_attr "type" "sseishft") + (set_attr "prefix" "vex") + (set_attr "mode" "TI")]) + (define_insn "ssse3_palignrti" [(set (match_operand:TI 0 "register_operand" "=x") (unspec:TI [(match_operand:TI 1 "register_operand" "0") @@ -6093,10 +8754,11 @@ [(set (match_operand:SSEMODE124 0 "register_operand" "=x") (abs:SSEMODE124 (match_operand:SSEMODE124 1 "nonimmediate_operand" "xm")))] "TARGET_SSSE3" - "pabs\t{%1, %0|%0, %1}"; + "%vpabs\t{%1, %0|%0, %1}" [(set_attr "type" "sselog1") (set_attr "prefix_data16" "1") (set_attr "prefix_extra" "1") + (set_attr "prefix" "maybe_vex") (set_attr "mode" "TI")]) (define_insn "abs2" @@ -6189,6 +8851,31 @@ ;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +(define_insn "avx_blendp" + [(set (match_operand:AVXMODEF2P 0 "register_operand" "=x") + (vec_merge:AVXMODEF2P + (match_operand:AVXMODEF2P 2 "nonimmediate_operand" "xm") + (match_operand:AVXMODEF2P 1 "register_operand" "x") + (match_operand:SI 3 "const_0_to__operand" "n")))] + "TARGET_AVX" + "vblendp\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "type" "ssemov") + 
(set_attr "prefix" "vex") + (set_attr "mode" "")]) + +(define_insn "avx_blendvp" + [(set (match_operand:AVXMODEF2P 0 "register_operand" "=x") + (unspec:AVXMODEF2P + [(match_operand:AVXMODEF2P 1 "register_operand" "x") + (match_operand:AVXMODEF2P 2 "nonimmediate_operand" "xm") + (match_operand:AVXMODEF2P 3 "register_operand" "x")] + UNSPEC_BLENDV))] + "TARGET_AVX" + "vblendvp\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "type" "ssemov") + (set_attr "prefix" "vex") + (set_attr "mode" "")]) + (define_insn "sse4_1_blendp" [(set (match_operand:SSEMODEF2P 0 "register_operand" "=x") (vec_merge:SSEMODEF2P @@ -6214,6 +8901,19 @@ (set_attr "prefix_extra" "1") (set_attr "mode" "")]) +(define_insn "avx_dpp" + [(set (match_operand:AVXMODEF2P 0 "register_operand" "=x") + (unspec:AVXMODEF2P + [(match_operand:AVXMODEF2P 1 "nonimmediate_operand" "%x") + (match_operand:AVXMODEF2P 2 "nonimmediate_operand" "xm") + (match_operand:SI 3 "const_0_to_255_operand" "n")] + UNSPEC_DP))] + "TARGET_AVX" + "vdpp\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "type" "ssemul") + (set_attr "prefix" "vex") + (set_attr "mode" "")]) + (define_insn "sse4_1_dpp" [(set (match_operand:SSEMODEF2P 0 "register_operand" "=x") (unspec:SSEMODEF2P @@ -6232,9 +8932,22 @@ (unspec:V2DI [(match_operand:V2DI 1 "memory_operand" "m")] UNSPEC_MOVNTDQA))] "TARGET_SSE4_1" - "movntdqa\t{%1, %0|%0, %1}" + "%vmovntdqa\t{%1, %0|%0, %1}" [(set_attr "type" "ssecvt") (set_attr "prefix_extra" "1") + (set_attr "prefix" "maybe_vex") + (set_attr "mode" "TI")]) + +(define_insn "*avx_mpsadbw" + [(set (match_operand:V16QI 0 "register_operand" "=x") + (unspec:V16QI [(match_operand:V16QI 1 "register_operand" "x") + (match_operand:V16QI 2 "nonimmediate_operand" "xm") + (match_operand:SI 3 "const_0_to_255_operand" "n")] + UNSPEC_MPSADBW))] + "TARGET_AVX" + "vmpsadbw\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "type" "sselog1") + (set_attr "prefix" "vex") (set_attr "mode" "TI")]) (define_insn "sse4_1_mpsadbw" @@ -6249,6 +8962,19 @@ (set_attr "prefix_extra" "1") (set_attr "mode" "TI")]) +(define_insn "*avx_packusdw" + [(set (match_operand:V8HI 0 "register_operand" "=x") + (vec_concat:V8HI + (us_truncate:V4HI + (match_operand:V4SI 1 "register_operand" "x")) + (us_truncate:V4HI + (match_operand:V4SI 2 "nonimmediate_operand" "xm"))))] + "TARGET_AVX" + "vpackusdw\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sselog") + (set_attr "prefix" "vex") + (set_attr "mode" "TI")]) + (define_insn "sse4_1_packusdw" [(set (match_operand:V8HI 0 "register_operand" "=x") (vec_concat:V8HI @@ -6262,6 +8988,18 @@ (set_attr "prefix_extra" "1") (set_attr "mode" "TI")]) +(define_insn "*avx_pblendvb" + [(set (match_operand:V16QI 0 "register_operand" "=x") + (unspec:V16QI [(match_operand:V16QI 1 "register_operand" "x") + (match_operand:V16QI 2 "nonimmediate_operand" "xm") + (match_operand:V16QI 3 "register_operand" "x")] + UNSPEC_BLENDV))] + "TARGET_AVX" + "vpblendvb\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "type" "ssemov") + (set_attr "prefix" "vex") + (set_attr "mode" "TI")]) + (define_insn "sse4_1_pblendvb" [(set (match_operand:V16QI 0 "reg_not_xmm0_operand" "=x") (unspec:V16QI [(match_operand:V16QI 1 "reg_not_xmm0_operand" "0") @@ -6274,6 +9012,18 @@ (set_attr "prefix_extra" "1") (set_attr "mode" "TI")]) +(define_insn "*avx_pblendw" + [(set (match_operand:V8HI 0 "register_operand" "=x") + (vec_merge:V8HI + (match_operand:V8HI 2 "nonimmediate_operand" "xm") + (match_operand:V8HI 1 "register_operand" "x") + (match_operand:SI 3 "const_0_to_255_operand" "n")))] + "TARGET_AVX" + 
"vpblendw\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "type" "ssemov") + (set_attr "prefix" "vex") + (set_attr "mode" "TI")]) + (define_insn "sse4_1_pblendw" [(set (match_operand:V8HI 0 "register_operand" "=x") (vec_merge:V8HI @@ -6291,9 +9041,10 @@ (unspec:V8HI [(match_operand:V8HI 1 "nonimmediate_operand" "xm")] UNSPEC_PHMINPOSUW))] "TARGET_SSE4_1" - "phminposuw\t{%1, %0|%0, %1}" + "%vphminposuw\t{%1, %0|%0, %1}" [(set_attr "type" "sselog1") (set_attr "prefix_extra" "1") + (set_attr "prefix" "maybe_vex") (set_attr "mode" "TI")]) (define_insn "sse4_1_extendv8qiv8hi2" @@ -6310,9 +9061,10 @@ (const_int 6) (const_int 7)]))))] "TARGET_SSE4_1" - "pmovsxbw\t{%1, %0|%0, %1}" + "%vpmovsxbw\t{%1, %0|%0, %1}" [(set_attr "type" "ssemov") (set_attr "prefix_extra" "1") + (set_attr "prefix" "maybe_vex") (set_attr "mode" "TI")]) (define_insn "*sse4_1_extendv8qiv8hi2" @@ -6330,9 +9082,10 @@ (const_int 6) (const_int 7)]))))] "TARGET_SSE4_1" - "pmovsxbw\t{%1, %0|%0, %1}" + "%vpmovsxbw\t{%1, %0|%0, %1}" [(set_attr "type" "ssemov") (set_attr "prefix_extra" "1") + (set_attr "prefix" "maybe_vex") (set_attr "mode" "TI")]) (define_insn "sse4_1_extendv4qiv4si2" @@ -6345,9 +9098,10 @@ (const_int 2) (const_int 3)]))))] "TARGET_SSE4_1" - "pmovsxbd\t{%1, %0|%0, %1}" + "%vpmovsxbd\t{%1, %0|%0, %1}" [(set_attr "type" "ssemov") (set_attr "prefix_extra" "1") + (set_attr "prefix" "maybe_vex") (set_attr "mode" "TI")]) (define_insn "*sse4_1_extendv4qiv4si2" @@ -6361,9 +9115,10 @@ (const_int 2) (const_int 3)]))))] "TARGET_SSE4_1" - "pmovsxbd\t{%1, %0|%0, %1}" + "%vpmovsxbd\t{%1, %0|%0, %1}" [(set_attr "type" "ssemov") (set_attr "prefix_extra" "1") + (set_attr "prefix" "maybe_vex") (set_attr "mode" "TI")]) (define_insn "sse4_1_extendv2qiv2di2" @@ -6374,9 +9129,10 @@ (parallel [(const_int 0) (const_int 1)]))))] "TARGET_SSE4_1" - "pmovsxbq\t{%1, %0|%0, %1}" + "%vpmovsxbq\t{%1, %0|%0, %1}" [(set_attr "type" "ssemov") (set_attr "prefix_extra" "1") + (set_attr "prefix" "maybe_vex") (set_attr "mode" "TI")]) (define_insn "*sse4_1_extendv2qiv2di2" @@ -6388,9 +9144,10 @@ (parallel [(const_int 0) (const_int 1)]))))] "TARGET_SSE4_1" - "pmovsxbq\t{%1, %0|%0, %1}" + "%vpmovsxbq\t{%1, %0|%0, %1}" [(set_attr "type" "ssemov") (set_attr "prefix_extra" "1") + (set_attr "prefix" "maybe_vex") (set_attr "mode" "TI")]) (define_insn "sse4_1_extendv4hiv4si2" @@ -6403,9 +9160,10 @@ (const_int 2) (const_int 3)]))))] "TARGET_SSE4_1" - "pmovsxwd\t{%1, %0|%0, %1}" + "%vpmovsxwd\t{%1, %0|%0, %1}" [(set_attr "type" "ssemov") (set_attr "prefix_extra" "1") + (set_attr "prefix" "maybe_vex") (set_attr "mode" "TI")]) (define_insn "*sse4_1_extendv4hiv4si2" @@ -6419,9 +9177,10 @@ (const_int 2) (const_int 3)]))))] "TARGET_SSE4_1" - "pmovsxwd\t{%1, %0|%0, %1}" + "%vpmovsxwd\t{%1, %0|%0, %1}" [(set_attr "type" "ssemov") (set_attr "prefix_extra" "1") + (set_attr "prefix" "maybe_vex") (set_attr "mode" "TI")]) (define_insn "sse4_1_extendv2hiv2di2" @@ -6432,9 +9191,10 @@ (parallel [(const_int 0) (const_int 1)]))))] "TARGET_SSE4_1" - "pmovsxwq\t{%1, %0|%0, %1}" + "%vpmovsxwq\t{%1, %0|%0, %1}" [(set_attr "type" "ssemov") (set_attr "prefix_extra" "1") + (set_attr "prefix" "maybe_vex") (set_attr "mode" "TI")]) (define_insn "*sse4_1_extendv2hiv2di2" @@ -6446,9 +9206,10 @@ (parallel [(const_int 0) (const_int 1)]))))] "TARGET_SSE4_1" - "pmovsxwq\t{%1, %0|%0, %1}" + "%vpmovsxwq\t{%1, %0|%0, %1}" [(set_attr "type" "ssemov") (set_attr "prefix_extra" "1") + (set_attr "prefix" "maybe_vex") (set_attr "mode" "TI")]) (define_insn "sse4_1_extendv2siv2di2" @@ -6459,9 +9220,10 @@ 
(parallel [(const_int 0) (const_int 1)]))))] "TARGET_SSE4_1" - "pmovsxdq\t{%1, %0|%0, %1}" + "%vpmovsxdq\t{%1, %0|%0, %1}" [(set_attr "type" "ssemov") (set_attr "prefix_extra" "1") + (set_attr "prefix" "maybe_vex") (set_attr "mode" "TI")]) (define_insn "*sse4_1_extendv2siv2di2" @@ -6473,9 +9235,10 @@ (parallel [(const_int 0) (const_int 1)]))))] "TARGET_SSE4_1" - "pmovsxdq\t{%1, %0|%0, %1}" + "%vpmovsxdq\t{%1, %0|%0, %1}" [(set_attr "type" "ssemov") (set_attr "prefix_extra" "1") + (set_attr "prefix" "maybe_vex") (set_attr "mode" "TI")]) (define_insn "sse4_1_zero_extendv8qiv8hi2" @@ -6492,9 +9255,10 @@ (const_int 6) (const_int 7)]))))] "TARGET_SSE4_1" - "pmovzxbw\t{%1, %0|%0, %1}" + "%vpmovzxbw\t{%1, %0|%0, %1}" [(set_attr "type" "ssemov") (set_attr "prefix_extra" "1") + (set_attr "prefix" "maybe_vex") (set_attr "mode" "TI")]) (define_insn "*sse4_1_zero_extendv8qiv8hi2" @@ -6512,9 +9276,10 @@ (const_int 6) (const_int 7)]))))] "TARGET_SSE4_1" - "pmovzxbw\t{%1, %0|%0, %1}" + "%vpmovzxbw\t{%1, %0|%0, %1}" [(set_attr "type" "ssemov") (set_attr "prefix_extra" "1") + (set_attr "prefix" "maybe_vex") (set_attr "mode" "TI")]) (define_insn "sse4_1_zero_extendv4qiv4si2" @@ -6527,9 +9292,10 @@ (const_int 2) (const_int 3)]))))] "TARGET_SSE4_1" - "pmovzxbd\t{%1, %0|%0, %1}" + "%vpmovzxbd\t{%1, %0|%0, %1}" [(set_attr "type" "ssemov") (set_attr "prefix_extra" "1") + (set_attr "prefix" "maybe_vex") (set_attr "mode" "TI")]) (define_insn "*sse4_1_zero_extendv4qiv4si2" @@ -6543,9 +9309,10 @@ (const_int 2) (const_int 3)]))))] "TARGET_SSE4_1" - "pmovzxbd\t{%1, %0|%0, %1}" + "%vpmovzxbd\t{%1, %0|%0, %1}" [(set_attr "type" "ssemov") (set_attr "prefix_extra" "1") + (set_attr "prefix" "maybe_vex") (set_attr "mode" "TI")]) (define_insn "sse4_1_zero_extendv2qiv2di2" @@ -6556,9 +9323,10 @@ (parallel [(const_int 0) (const_int 1)]))))] "TARGET_SSE4_1" - "pmovzxbq\t{%1, %0|%0, %1}" + "%vpmovzxbq\t{%1, %0|%0, %1}" [(set_attr "type" "ssemov") (set_attr "prefix_extra" "1") + (set_attr "prefix" "maybe_vex") (set_attr "mode" "TI")]) (define_insn "*sse4_1_zero_extendv2qiv2di2" @@ -6570,9 +9338,10 @@ (parallel [(const_int 0) (const_int 1)]))))] "TARGET_SSE4_1" - "pmovzxbq\t{%1, %0|%0, %1}" + "%vpmovzxbq\t{%1, %0|%0, %1}" [(set_attr "type" "ssemov") (set_attr "prefix_extra" "1") + (set_attr "prefix" "maybe_vex") (set_attr "mode" "TI")]) (define_insn "sse4_1_zero_extendv4hiv4si2" @@ -6585,9 +9354,10 @@ (const_int 2) (const_int 3)]))))] "TARGET_SSE4_1" - "pmovzxwd\t{%1, %0|%0, %1}" + "%vpmovzxwd\t{%1, %0|%0, %1}" [(set_attr "type" "ssemov") (set_attr "prefix_extra" "1") + (set_attr "prefix" "maybe_vex") (set_attr "mode" "TI")]) (define_insn "*sse4_1_zero_extendv4hiv4si2" @@ -6601,9 +9371,10 @@ (const_int 2) (const_int 3)]))))] "TARGET_SSE4_1" - "pmovzxwd\t{%1, %0|%0, %1}" + "%vpmovzxwd\t{%1, %0|%0, %1}" [(set_attr "type" "ssemov") (set_attr "prefix_extra" "1") + (set_attr "prefix" "maybe_vex") (set_attr "mode" "TI")]) (define_insn "sse4_1_zero_extendv2hiv2di2" @@ -6614,9 +9385,10 @@ (parallel [(const_int 0) (const_int 1)]))))] "TARGET_SSE4_1" - "pmovzxwq\t{%1, %0|%0, %1}" + "%vpmovzxwq\t{%1, %0|%0, %1}" [(set_attr "type" "ssemov") (set_attr "prefix_extra" "1") + (set_attr "prefix" "maybe_vex") (set_attr "mode" "TI")]) (define_insn "*sse4_1_zero_extendv2hiv2di2" @@ -6628,9 +9400,10 @@ (parallel [(const_int 0) (const_int 1)]))))] "TARGET_SSE4_1" - "pmovzxwq\t{%1, %0|%0, %1}" + "%vpmovzxwq\t{%1, %0|%0, %1}" [(set_attr "type" "ssemov") (set_attr "prefix_extra" "1") + (set_attr "prefix" "maybe_vex") (set_attr "mode" "TI")]) (define_insn 
"sse4_1_zero_extendv2siv2di2" @@ -6641,9 +9414,10 @@ (parallel [(const_int 0) (const_int 1)]))))] "TARGET_SSE4_1" - "pmovzxdq\t{%1, %0|%0, %1}" + "%vpmovzxdq\t{%1, %0|%0, %1}" [(set_attr "type" "ssemov") (set_attr "prefix_extra" "1") + (set_attr "prefix" "maybe_vex") (set_attr "mode" "TI")]) (define_insn "*sse4_1_zero_extendv2siv2di2" @@ -6655,24 +9429,62 @@ (parallel [(const_int 0) (const_int 1)]))))] "TARGET_SSE4_1" - "pmovzxdq\t{%1, %0|%0, %1}" + "%vpmovzxdq\t{%1, %0|%0, %1}" [(set_attr "type" "ssemov") (set_attr "prefix_extra" "1") + (set_attr "prefix" "maybe_vex") (set_attr "mode" "TI")]) +;; ptestps/ptestpd are very similar to comiss and ucomiss when +;; setting FLAGS_REG. But it is not a really compare instruction. +(define_insn "avx_vtestp" + [(set (reg:CC FLAGS_REG) + (unspec:CC [(match_operand:AVXMODEF2P 0 "register_operand" "x") + (match_operand:AVXMODEF2P 1 "nonimmediate_operand" "xm")] + UNSPEC_VTESTP))] + "TARGET_AVX" + "vtestp\t{%1, %0|%0, %1}" + [(set_attr "type" "ssecomi") + (set_attr "prefix" "vex") + (set_attr "mode" "")]) + ;; ptest is very similar to comiss and ucomiss when setting FLAGS_REG. ;; But it is not a really compare instruction. +(define_insn "avx_ptest256" + [(set (reg:CC FLAGS_REG) + (unspec:CC [(match_operand:V4DI 0 "register_operand" "x") + (match_operand:V4DI 1 "nonimmediate_operand" "xm")] + UNSPEC_PTEST))] + "TARGET_AVX" + "vptest\t{%1, %0|%0, %1}" + [(set_attr "type" "ssecomi") + (set_attr "prefix" "vex") + (set_attr "mode" "OI")]) + (define_insn "sse4_1_ptest" [(set (reg:CC FLAGS_REG) (unspec:CC [(match_operand:V2DI 0 "register_operand" "x") (match_operand:V2DI 1 "nonimmediate_operand" "xm")] UNSPEC_PTEST))] "TARGET_SSE4_1" - "ptest\t{%1, %0|%0, %1}" + "%vptest\t{%1, %0|%0, %1}" [(set_attr "type" "ssecomi") (set_attr "prefix_extra" "1") + (set_attr "prefix" "maybe_vex") (set_attr "mode" "TI")]) +(define_insn "avx_roundp256" + [(set (match_operand:AVX256MODEF2P 0 "register_operand" "=x") + (unspec:AVX256MODEF2P + [(match_operand:AVX256MODEF2P 1 "nonimmediate_operand" "xm") + (match_operand:SI 2 "const_0_to_15_operand" "n")] + UNSPEC_ROUND))] + "TARGET_AVX" + "vroundp\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "ssecvt") + (set_attr "prefix" "vex") + (set_attr "mode" "")]) + (define_insn "sse4_1_roundp" [(set (match_operand:SSEMODEF2P 0 "register_operand" "=x") (unspec:SSEMODEF2P @@ -6680,9 +9492,25 @@ (match_operand:SI 2 "const_0_to_15_operand" "n")] UNSPEC_ROUND))] "TARGET_ROUND" - "roundp\t{%2, %1, %0|%0, %1, %2}" + "%vroundp\t{%2, %1, %0|%0, %1, %2}" [(set_attr "type" "ssecvt") (set_attr "prefix_extra" "1") + (set_attr "prefix" "maybe_vex") + (set_attr "mode" "")]) + +(define_insn "*avx_rounds" + [(set (match_operand:SSEMODEF2P 0 "register_operand" "=x") + (vec_merge:SSEMODEF2P + (unspec:SSEMODEF2P + [(match_operand:SSEMODEF2P 2 "register_operand" "x") + (match_operand:SI 3 "const_0_to_15_operand" "n")] + UNSPEC_ROUND) + (match_operand:SSEMODEF2P 1 "register_operand" "x") + (const_int 1)))] + "TARGET_AVX" + "vrounds\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "type" "ssecvt") + (set_attr "prefix" "vex") (set_attr "mode" "")]) (define_insn "sse4_1_rounds" @@ -6780,10 +9608,11 @@ (match_dup 5)] UNSPEC_PCMPESTR))] "TARGET_SSE4_2" - "pcmpestri\t{%5, %3, %1|%1, %3, %5}" + "%vpcmpestri\t{%5, %3, %1|%1, %3, %5}" [(set_attr "type" "sselog") (set_attr "prefix_data16" "1") (set_attr "prefix_extra" "1") + (set_attr "prefix" "maybe_vex") (set_attr "memory" "none,load") (set_attr "mode" "TI")]) @@ -6805,10 +9634,11 @@ (match_dup 5)] UNSPEC_PCMPESTR))] 
"TARGET_SSE4_2" - "pcmpestrm\t{%5, %3, %1|%1, %3, %5}" + "%vpcmpestrm\t{%5, %3, %1|%1, %3, %5}" [(set_attr "type" "sselog") (set_attr "prefix_data16" "1") (set_attr "prefix_extra" "1") + (set_attr "prefix" "maybe_vex") (set_attr "memory" "none,load") (set_attr "mode" "TI")]) @@ -6825,14 +9655,15 @@ (clobber (match_scratch:SI 1 "= X, X,c,c"))] "TARGET_SSE4_2" "@ - pcmpestrm\t{%6, %4, %2|%2, %4, %6} - pcmpestrm\t{%6, %4, %2|%2, %4, %6} - pcmpestri\t{%6, %4, %2|%2, %4, %6} - pcmpestri\t{%6, %4, %2|%2, %4, %6}" + %vpcmpestrm\t{%6, %4, %2|%2, %4, %6} + %vpcmpestrm\t{%6, %4, %2|%2, %4, %6} + %vpcmpestri\t{%6, %4, %2|%2, %4, %6} + %vpcmpestri\t{%6, %4, %2|%2, %4, %6}" [(set_attr "type" "sselog") (set_attr "prefix_data16" "1") (set_attr "prefix_extra" "1") (set_attr "memory" "none,load,none,load") + (set_attr "prefix" "maybe_vex") (set_attr "mode" "TI")]) (define_insn_and_split "sse4_2_pcmpistr" @@ -6896,10 +9727,11 @@ (match_dup 3)] UNSPEC_PCMPISTR))] "TARGET_SSE4_2" - "pcmpistri\t{%3, %2, %1|%1, %2, %3}" + "%vpcmpistri\t{%3, %2, %1|%1, %2, %3}" [(set_attr "type" "sselog") (set_attr "prefix_data16" "1") (set_attr "prefix_extra" "1") + (set_attr "prefix" "maybe_vex") (set_attr "memory" "none,load") (set_attr "mode" "TI")]) @@ -6917,10 +9749,11 @@ (match_dup 3)] UNSPEC_PCMPISTR))] "TARGET_SSE4_2" - "pcmpistrm\t{%3, %2, %1|%1, %2, %3}" + "%vpcmpistrm\t{%3, %2, %1|%1, %2, %3}" [(set_attr "type" "sselog") (set_attr "prefix_data16" "1") (set_attr "prefix_extra" "1") + (set_attr "prefix" "maybe_vex") (set_attr "memory" "none,load") (set_attr "mode" "TI")]) @@ -6935,14 +9768,15 @@ (clobber (match_scratch:SI 1 "= X, X,c,c"))] "TARGET_SSE4_2" "@ - pcmpistrm\t{%4, %3, %2|%2, %3, %4} - pcmpistrm\t{%4, %3, %2|%2, %3, %4} - pcmpistri\t{%4, %3, %2|%2, %3, %4} - pcmpistri\t{%4, %3, %2|%2, %3, %4}" + %vpcmpistrm\t{%4, %3, %2|%2, %3, %4} + %vpcmpistrm\t{%4, %3, %2|%2, %3, %4} + %vpcmpistri\t{%4, %3, %2|%2, %3, %4} + %vpcmpistri\t{%4, %3, %2|%2, %3, %4}" [(set_attr "type" "sselog") (set_attr "prefix_data16" "1") (set_attr "prefix_extra" "1") (set_attr "memory" "none,load,none,load") + (set_attr "prefix" "maybe_vex") (set_attr "mode" "TI")]) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -8541,6 +11375,17 @@ [(set_attr "type" "ssecmp") (set_attr "mode" "TI")]) +(define_insn "*avx_aesenc" + [(set (match_operand:V2DI 0 "register_operand" "=x") + (unspec:V2DI [(match_operand:V2DI 1 "register_operand" "x") + (match_operand:V2DI 2 "nonimmediate_operand" "xm")] + UNSPEC_AESENC))] + "TARGET_AES && TARGET_AVX" + "vaesenc\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sselog1") + (set_attr "prefix" "vex") + (set_attr "mode" "TI")]) + (define_insn "aesenc" [(set (match_operand:V2DI 0 "register_operand" "=x") (unspec:V2DI [(match_operand:V2DI 1 "register_operand" "0") @@ -8552,6 +11397,17 @@ (set_attr "prefix_extra" "1") (set_attr "mode" "TI")]) +(define_insn "*avx_aesenclast" + [(set (match_operand:V2DI 0 "register_operand" "=x") + (unspec:V2DI [(match_operand:V2DI 1 "register_operand" "x") + (match_operand:V2DI 2 "nonimmediate_operand" "xm")] + UNSPEC_AESENCLAST))] + "TARGET_AES && TARGET_AVX" + "vaesenclast\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sselog1") + (set_attr "prefix" "vex") + (set_attr "mode" "TI")]) + (define_insn "aesenclast" [(set (match_operand:V2DI 0 "register_operand" "=x") (unspec:V2DI [(match_operand:V2DI 1 "register_operand" "0") @@ -8563,6 +11419,17 @@ (set_attr "prefix_extra" "1") (set_attr "mode" "TI")]) +(define_insn "*avx_aesdec" + [(set (match_operand:V2DI 0 
"register_operand" "=x") + (unspec:V2DI [(match_operand:V2DI 1 "register_operand" "x") + (match_operand:V2DI 2 "nonimmediate_operand" "xm")] + UNSPEC_AESDEC))] + "TARGET_AES && TARGET_AVX" + "vaesdec\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sselog1") + (set_attr "prefix" "vex") + (set_attr "mode" "TI")]) + (define_insn "aesdec" [(set (match_operand:V2DI 0 "register_operand" "=x") (unspec:V2DI [(match_operand:V2DI 1 "register_operand" "0") @@ -8574,6 +11441,17 @@ (set_attr "prefix_extra" "1") (set_attr "mode" "TI")]) +(define_insn "*avx_aesdeclast" + [(set (match_operand:V2DI 0 "register_operand" "=x") + (unspec:V2DI [(match_operand:V2DI 1 "register_operand" "x") + (match_operand:V2DI 2 "nonimmediate_operand" "xm")] + UNSPEC_AESDECLAST))] + "TARGET_AES && TARGET_AVX" + "vaesdeclast\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sselog1") + (set_attr "prefix" "vex") + (set_attr "mode" "TI")]) + (define_insn "aesdeclast" [(set (match_operand:V2DI 0 "register_operand" "=x") (unspec:V2DI [(match_operand:V2DI 1 "register_operand" "0") @@ -8590,9 +11468,10 @@ (unspec:V2DI [(match_operand:V2DI 1 "nonimmediate_operand" "xm")] UNSPEC_AESIMC))] "TARGET_AES" - "aesimc\t{%1, %0|%0, %1}" + "%vaesimc\t{%1, %0|%0, %1}" [(set_attr "type" "sselog1") (set_attr "prefix_extra" "1") + (set_attr "prefix" "maybe_vex") (set_attr "mode" "TI")]) (define_insn "aeskeygenassist" @@ -8601,9 +11480,10 @@ (match_operand:SI 2 "const_0_to_255_operand" "n")] UNSPEC_AESKEYGENASSIST))] "TARGET_AES" - "aeskeygenassist\t{%2, %1, %0|%0, %1, %2}" + "%vaeskeygenassist\t{%2, %1, %0|%0, %1, %2}" [(set_attr "type" "sselog1") (set_attr "prefix_extra" "1") + (set_attr "prefix" "maybe_vex") (set_attr "mode" "TI")]) (define_insn "pclmulqdq" @@ -8617,3 +11497,463 @@ [(set_attr "type" "sselog1") (set_attr "prefix_extra" "1") (set_attr "mode" "TI")]) + +(define_expand "avx_vzeroall" + [(match_par_dup 0 [(const_int 0)])] + "TARGET_AVX" +{ + int nregs = TARGET_64BIT ? 16 : 8; + int regno; + + operands[0] = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (nregs + 1)); + + XVECEXP (operands[0], 0, 0) + = gen_rtx_UNSPEC_VOLATILE (VOIDmode, gen_rtvec (1, const0_rtx), + UNSPECV_VZEROALL); + + for (regno = 0; regno < nregs; regno++) + XVECEXP (operands[0], 0, regno + 1) + = gen_rtx_SET (VOIDmode, + gen_rtx_REG (V8SImode, SSE_REGNO (regno)), + CONST0_RTX (V8SImode)); +}) + +(define_insn "*avx_vzeroall" + [(match_parallel 0 "vzeroall_operation" + [(unspec_volatile [(const_int 0)] UNSPECV_VZEROALL) + (set (match_operand 1 "register_operand" "=x") + (match_operand 2 "const0_operand" "X"))])] + "TARGET_AVX" + "vzeroall" + [(set_attr "type" "sse") + (set_attr "memory" "none") + (set_attr "prefix" "vex") + (set_attr "mode" "OI")]) + +;; vzeroupper clobbers the upper 128bits of AVX registers. 
+(define_insn "avx_vzeroupper" + [(unspec_volatile [(const_int 0)] UNSPECV_VZEROUPPER) + (clobber (reg:V8SI XMM0_REG)) + (clobber (reg:V8SI XMM1_REG)) + (clobber (reg:V8SI XMM2_REG)) + (clobber (reg:V8SI XMM3_REG)) + (clobber (reg:V8SI XMM4_REG)) + (clobber (reg:V8SI XMM5_REG)) + (clobber (reg:V8SI XMM6_REG)) + (clobber (reg:V8SI XMM7_REG))] + "TARGET_AVX && !TARGET_64BIT" + "vzeroupper" + [(set_attr "type" "sse") + (set_attr "memory" "none") + (set_attr "prefix" "vex") + (set_attr "mode" "OI")]) + +(define_insn "avx_vzeroupper_rex64" + [(unspec_volatile [(const_int 0)] UNSPECV_VZEROUPPER) + (clobber (reg:V8SI XMM0_REG)) + (clobber (reg:V8SI XMM1_REG)) + (clobber (reg:V8SI XMM2_REG)) + (clobber (reg:V8SI XMM3_REG)) + (clobber (reg:V8SI XMM4_REG)) + (clobber (reg:V8SI XMM5_REG)) + (clobber (reg:V8SI XMM6_REG)) + (clobber (reg:V8SI XMM7_REG)) + (clobber (reg:V8SI XMM8_REG)) + (clobber (reg:V8SI XMM9_REG)) + (clobber (reg:V8SI XMM10_REG)) + (clobber (reg:V8SI XMM11_REG)) + (clobber (reg:V8SI XMM12_REG)) + (clobber (reg:V8SI XMM13_REG)) + (clobber (reg:V8SI XMM14_REG)) + (clobber (reg:V8SI XMM15_REG))] + "TARGET_AVX && TARGET_64BIT" + "vzeroupper" + [(set_attr "type" "sse") + (set_attr "memory" "none") + (set_attr "prefix" "vex") + (set_attr "mode" "OI")]) + +(define_insn "avx_vpermil" + [(set (match_operand:AVXMODEF2P 0 "register_operand" "=x") + (unspec:AVXMODEF2P + [(match_operand:AVXMODEF2P 1 "register_operand" "xm") + (match_operand:SI 2 "const_0_to__operand" "n")] + UNSPEC_VPERMIL))] + "TARGET_AVX" + "vpermilp\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sselog") + (set_attr "prefix" "vex") + (set_attr "mode" "")]) + +(define_insn "avx_vpermilvar3" + [(set (match_operand:AVXMODEF2P 0 "register_operand" "=x") + (unspec:AVXMODEF2P + [(match_operand:AVXMODEF2P 1 "register_operand" "x") + (match_operand: 2 "nonimmediate_operand" "xm")] + UNSPEC_VPERMIL))] + "TARGET_AVX" + "vpermilp\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sselog") + (set_attr "prefix" "vex") + (set_attr "mode" "")]) + +(define_insn "avx_vpermil23" + [(set (match_operand:AVXMODEF2P 0 "register_operand" "=x,x") + (unspec:AVXMODEF2P + [(match_operand:AVXMODEF2P 1 "register_operand" "x,x") + (match_operand:AVXMODEF2P 2 "nonimmediate_operand" "x,xm") + (match_operand: 3 "nonimmediate_operand" "xm,x") + (match_operand:SI 4 "const_0_to_3_operand" "n,n")] + UNSPEC_VPERMIL2))] + "TARGET_AVX" + "vpermil2p\t{%4, %3, %2, %1, %0|%0, %1, %2, %3, %4}" + [(set_attr "type" "sselog") + (set_attr "prefix" "vex") + (set_attr "mode" "")]) + +(define_insn "avx_vperm2f1283" + [(set (match_operand:AVX256MODE2P 0 "register_operand" "=x") + (unspec:AVX256MODE2P + [(match_operand:AVX256MODE2P 1 "register_operand" "x") + (match_operand:AVX256MODE2P 2 "nonimmediate_operand" "xm") + (match_operand:SI 3 "const_0_to_255_operand" "n")] + UNSPEC_VPERMIL2F128))] + "TARGET_AVX" + "vperm2f128\t{%3, %2, %1, %0|%0, %1, %2, %3}" + [(set_attr "type" "sselog") + (set_attr "prefix" "vex") + (set_attr "mode" "V8SF")]) + +(define_insn "avx_vbroadcasts" + [(set (match_operand:AVXMODEF4P 0 "register_operand" "=x") + (vec_concat:AVXMODEF4P + (vec_concat: + (match_operand: 1 "memory_operand" "m") + (match_dup 1)) + (vec_concat: + (match_dup 1) + (match_dup 1))))] + "TARGET_AVX" + "vbroadcasts\t{%1, %0|%0, %1}" + [(set_attr "type" "ssemov") + (set_attr "prefix" "vex") + (set_attr "mode" "")]) + +(define_insn "avx_vbroadcastss256" + [(set (match_operand:V8SF 0 "register_operand" "=x") + (vec_concat:V8SF + (vec_concat:V4SF + (vec_concat:V2SF + (match_operand:SF 1 
"memory_operand" "m") + (match_dup 1)) + (vec_concat:V2SF + (match_dup 1) + (match_dup 1))) + (vec_concat:V4SF + (vec_concat:V2SF + (match_dup 1) + (match_dup 1)) + (vec_concat:V2SF + (match_dup 1) + (match_dup 1)))))] + "TARGET_AVX" + "vbroadcastss\t{%1, %0|%0, %1}" + [(set_attr "type" "ssemov") + (set_attr "prefix" "vex") + (set_attr "mode" "SF")]) + +(define_insn "avx_vbroadcastf128_p256" + [(set (match_operand:AVX256MODEF2P 0 "register_operand" "=x") + (vec_concat:AVX256MODEF2P + (match_operand: 1 "memory_operand" "m") + (match_dup 1)))] + "TARGET_AVX" + "vbroadcastf128\t{%1, %0|%0, %1}" + [(set_attr "type" "ssemov") + (set_attr "prefix" "vex") + (set_attr "mode" "V4SF")]) + +(define_expand "avx_vinsertf128" + [(match_operand:AVX256MODE 0 "register_operand" "") + (match_operand:AVX256MODE 1 "register_operand" "") + (match_operand: 2 "nonimmediate_operand" "") + (match_operand:SI 3 "const_0_to_1_operand" "")] + "TARGET_AVX" +{ + switch (INTVAL (operands[3])) + { + case 0: + emit_insn (gen_vec_set_lo_ (operands[0], operands[1], + operands[2])); + break; + case 1: + emit_insn (gen_vec_set_hi_ (operands[0], operands[1], + operands[2])); + break; + default: + gcc_unreachable (); + } + DONE; +}) + +(define_insn "vec_set_lo_" + [(set (match_operand:AVX256MODE4P 0 "register_operand" "=x") + (vec_concat:AVX256MODE4P + (match_operand: 2 "nonimmediate_operand" "xm") + (vec_select: + (match_operand:AVX256MODE4P 1 "register_operand" "x") + (parallel [(const_int 2) (const_int 3)]))))] + "TARGET_AVX" + "vinsertf128\t{$0x0, %2, %1, %0|%0, %1, %2, 0x0}" + [(set_attr "type" "sselog") + (set_attr "prefix" "vex") + (set_attr "mode" "V8SF")]) + +(define_insn "vec_set_hi_" + [(set (match_operand:AVX256MODE4P 0 "register_operand" "=x") + (vec_concat:AVX256MODE4P + (vec_select: + (match_operand:AVX256MODE4P 1 "register_operand" "x") + (parallel [(const_int 0) (const_int 1)])) + (match_operand: 2 "nonimmediate_operand" "xm")))] + "TARGET_AVX" + "vinsertf128\t{$0x1, %2, %1, %0|%0, %1, %2, 0x1}" + [(set_attr "type" "sselog") + (set_attr "prefix" "vex") + (set_attr "mode" "V8SF")]) + +(define_insn "vec_set_lo_" + [(set (match_operand:AVX256MODE8P 0 "register_operand" "=x") + (vec_concat:AVX256MODE8P + (match_operand: 2 "nonimmediate_operand" "xm") + (vec_select: + (match_operand:AVX256MODE8P 1 "register_operand" "x") + (parallel [(const_int 4) (const_int 5) + (const_int 6) (const_int 7)]))))] + "TARGET_AVX" + "vinsertf128\t{$0x0, %2, %1, %0|%0, %1, %2, 0x0}" + [(set_attr "type" "sselog") + (set_attr "prefix" "vex") + (set_attr "mode" "V8SF")]) + +(define_insn "vec_set_hi_" + [(set (match_operand:AVX256MODE8P 0 "register_operand" "=x") + (vec_concat:AVX256MODE8P + (vec_select: + (match_operand:AVX256MODE8P 1 "register_operand" "x") + (parallel [(const_int 0) (const_int 1) + (const_int 2) (const_int 3)])) + (match_operand: 2 "nonimmediate_operand" "xm")))] + "TARGET_AVX" + "vinsertf128\t{$0x1, %2, %1, %0|%0, %1, %2, 0x1}" + [(set_attr "type" "sselog") + (set_attr "prefix" "vex") + (set_attr "mode" "V8SF")]) + +(define_insn "vec_set_lo_v16hi" + [(set (match_operand:V16HI 0 "register_operand" "=x") + (vec_concat:V16HI + (match_operand:V8HI 2 "nonimmediate_operand" "xm") + (vec_select:V8HI + (match_operand:V16HI 1 "register_operand" "x") + (parallel [(const_int 8) (const_int 9) + (const_int 10) (const_int 11) + (const_int 12) (const_int 13) + (const_int 14) (const_int 15)]))))] + "TARGET_AVX" + "vinsertf128\t{$0x0, %2, %1, %0|%0, %1, %2, 0x0}" + [(set_attr "type" "sselog") + (set_attr "prefix" "vex") + (set_attr 
"mode" "V8SF")]) + +(define_insn "vec_set_hi_v16hi" + [(set (match_operand:V16HI 0 "register_operand" "=x") + (vec_concat:V16HI + (vec_select:V8HI + (match_operand:V16HI 1 "register_operand" "x") + (parallel [(const_int 0) (const_int 1) + (const_int 2) (const_int 3) + (const_int 4) (const_int 5) + (const_int 6) (const_int 7)])) + (match_operand:V8HI 2 "nonimmediate_operand" "xm")))] + "TARGET_AVX" + "vinsertf128\t{$0x1, %2, %1, %0|%0, %1, %2, 0x1}" + [(set_attr "type" "sselog") + (set_attr "prefix" "vex") + (set_attr "mode" "V8SF")]) + +(define_insn "vec_set_lo_v32qi" + [(set (match_operand:V32QI 0 "register_operand" "=x") + (vec_concat:V32QI + (match_operand:V16QI 2 "nonimmediate_operand" "xm") + (vec_select:V16QI + (match_operand:V32QI 1 "register_operand" "x") + (parallel [(const_int 16) (const_int 17) + (const_int 18) (const_int 19) + (const_int 20) (const_int 21) + (const_int 22) (const_int 23) + (const_int 24) (const_int 25) + (const_int 26) (const_int 27) + (const_int 28) (const_int 29) + (const_int 30) (const_int 31)]))))] + "TARGET_AVX" + "vinsertf128\t{$0x0, %2, %1, %0|%0, %1, %2, 0x0}" + [(set_attr "type" "sselog") + (set_attr "prefix" "vex") + (set_attr "mode" "V8SF")]) + +(define_insn "vec_set_hi_v32qi" + [(set (match_operand:V32QI 0 "register_operand" "=x") + (vec_concat:V32QI + (vec_select:V16QI + (match_operand:V32QI 1 "register_operand" "x") + (parallel [(const_int 0) (const_int 1) + (const_int 2) (const_int 3) + (const_int 4) (const_int 5) + (const_int 6) (const_int 7) + (const_int 8) (const_int 9) + (const_int 10) (const_int 11) + (const_int 12) (const_int 13) + (const_int 14) (const_int 15)])) + (match_operand:V16QI 2 "nonimmediate_operand" "xm")))] + "TARGET_AVX" + "vinsertf128\t{$0x1, %2, %1, %0|%0, %1, %2, 0x1}" + [(set_attr "type" "sselog") + (set_attr "prefix" "vex") + (set_attr "mode" "V8SF")]) + +(define_insn "avx_maskloadp" + [(set (match_operand:AVXMODEF2P 0 "register_operand" "=x") + (unspec:AVXMODEF2P + [(match_operand:AVXMODEF2P 1 "memory_operand" "m") + (match_operand:AVXMODEF2P 2 "register_operand" "x") + (match_dup 0)] + UNSPEC_MASKLOAD))] + "TARGET_AVX" + "vmaskmovp\t{%1, %2, %0|%0, %2, %1}" + [(set_attr "type" "sselog1") + (set_attr "prefix" "vex") + (set_attr "mode" "")]) + +(define_insn "avx_maskstorep" + [(set (match_operand:AVXMODEF2P 0 "memory_operand" "=m") + (unspec:AVXMODEF2P + [(match_operand:AVXMODEF2P 1 "register_operand" "x") + (match_operand:AVXMODEF2P 2 "register_operand" "x") + (match_dup 0)] + UNSPEC_MASKSTORE))] + "TARGET_AVX" + "vmaskmovp\t{%2, %1, %0|%0, %1, %2}" + [(set_attr "type" "sselog1") + (set_attr "prefix" "vex") + (set_attr "mode" "")]) + +(define_insn "avx__" + [(set (match_operand:AVX256MODE2P 0 "register_operand" "=x,x") + (unspec:AVX256MODE2P + [(match_operand: 1 "nonimmediate_operand" "0,xm")] + UNSPEC_CAST))] + "TARGET_AVX" +{ + switch (which_alternative) + { + case 0: + return ""; + case 1: + switch (get_attr_mode (insn)) + { + case MODE_V8SF: + return "vmovaps\t{%1, %x0|%x0, %1}"; + case MODE_V4DF: + return "vmovapd\t{%1, %x0|%x0, %1}"; + case MODE_OI: + return "vmovdqa\t{%1, %x0|%x0, %1}"; + default: + break; + } + default: + break; + } + gcc_unreachable (); +} + [(set_attr "type" "ssemov") + (set_attr "prefix" "vex") + (set_attr "mode" "") + (set (attr "length") + (if_then_else (eq_attr "alternative" "0") + (const_string "0") + (const_string "*")))]) + +(define_insn "avx__" + [(set (match_operand: 0 "register_operand" "=x,x") + (unspec: + [(match_operand:AVX256MODE2P 1 "nonimmediate_operand" "0,xm")] + 
UNSPEC_CAST))] + "TARGET_AVX" +{ + switch (which_alternative) + { + case 0: + return ""; + case 1: + switch (get_attr_mode (insn)) + { + case MODE_V8SF: + return "vmovaps\t{%x1, %0|%0, %x1}"; + case MODE_V4DF: + return "vmovapd\t{%x1, %0|%0, %x1}"; + case MODE_OI: + return "vmovdqa\t{%x1, %0|%0, %x1}"; + default: + break; + } + default: + break; + } + gcc_unreachable (); +} + [(set_attr "type" "ssemov") + (set_attr "prefix" "vex") + (set_attr "mode" "") + (set (attr "length") + (if_then_else (eq_attr "alternative" "0") + (const_string "0") + (const_string "*")))]) + +(define_expand "vec_init" + [(match_operand:AVX256MODE 0 "register_operand" "") + (match_operand 1 "" "")] + "TARGET_AVX" +{ + ix86_expand_vector_init (false, operands[0], operands[1]); + DONE; +}) + +(define_insn "*vec_concat_avx" + [(set (match_operand:AVX256MODE 0 "register_operand" "=x,x") + (vec_concat:AVX256MODE + (match_operand: 1 "register_operand" "x,x") + (match_operand: 2 "vector_move_operand" "xm,C")))] + "TARGET_AVX" +{ + switch (which_alternative) + { + case 0: + return "vinsertf128\t{$0x1, %2, %t1, %0|%0, %t1, %2, 0x1}"; + case 1: + switch (get_attr_mode (insn)) + { + case MODE_V8SF: + return "vmovaps\t{%1, %x0|%x0, %1}"; + case MODE_V4DF: + return "vmovapd\t{%1, %x0|%x0, %1}"; + default: + return "vmovdqa\t{%1, %x0|%x0, %1}"; + } + default: + gcc_unreachable (); + } +} + [(set_attr "type" "sselog,ssemov") + (set_attr "prefix" "vex") + (set_attr "mode" "")]) -- 2.7.4
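As a usage sketch (not part of the patch; the function name below is illustrative and the intrinsics come from the standard SSSE3 header tmmintrin.h), the new TARGET_AVX patterns above are reached simply by recompiling existing intrinsic code with -mavx: insns such as *avx_phaddwv8hi3 and *avx_pshufbv16qi3 match before their ssse3_* counterparts and emit the three-operand, VEX-encoded forms.

/* Illustrative only: with -mssse3 this goes through the ssse3_*
   patterns and emits the two-operand phaddw/pshufb; with -mavx the
   *avx_* patterns match instead and the same source emits the
   VEX-encoded vphaddw/vpshufb.  */
#include <tmmintrin.h>

__m128i
hadd_then_shuffle (__m128i a, __m128i b, __m128i mask)
{
  __m128i sums = _mm_hadd_epi16 (a, b);   /* phaddw / vphaddw */
  return _mm_shuffle_epi8 (sums, mask);   /* pshufb / vpshufb */
}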