* config/i386/i386.c (avx_vperm2f128_parallel): New.
* config/i386/i386-protos.h: Declare it.
* config/i386/predicates.md (avx_vperm2f128_v8sf_operand,
avx_vperm2f128_v8si_operand, avx_vperm2f128_v4df_operand): New.
* config/i386/sse.md (avx_vperm2f128<mode>3): Change to expander.
(*avx_vperm2f128<mode>_full): Renamed from avx_vperm2f128<mode>3.
(*avx_vperm2f128<mode>_nozero): New.
From-SVN: r154832
2009-11-30 Richard Henderson <rth@redhat.com>
+ * config/i386/i386.c (avx_vperm2f128_parallel): New.
+ * config/i386/i386-protos.h: Declare it.
+ * config/i386/predicates.md (avx_vperm2f128_v8sf_operand,
+ avx_vperm2f128_v8si_operand, avx_vperm2f128_v4df_operand): New.
+ * config/i386/sse.md (avx_vperm2f128<mode>3): Change to expander.
+ (*avx_vperm2f128<mode>_full): Renamed from avx_vperm2f128<mode>3.
+ (*avx_vperm2f128<mode>_nozero): New.
+
+2009-11-30 Richard Henderson <rth@redhat.com>
+
* config/i386/i386-builtin-types.def (V4DF_FTYPE_V4DF_V4DF_V4DI): New.
(V8SF_FTYPE_V8SF_V8SF_V8SI): New.
* config/i386/i386.c (ix86_vectorize_builtin_vec_perm): Support
extern enum machine_mode ix86_cc_mode (enum rtx_code, rtx, rtx);
extern int avx_vpermilp_parallel (rtx par, enum machine_mode mode);
+extern int avx_vperm2f128_parallel (rtx par, enum machine_mode mode);
extern int ix86_expand_movmem (rtx, rtx, rtx, rtx, rtx, rtx);
extern int ix86_expand_setmem (rtx, rtx, rtx, rtx, rtx, rtx);
/* Make sure success has a non-zero value by adding one. */
return mask + 1;
}
+
+/* Recognize the PARALLEL of a vperm2f128-style selection.  Shared by
+   avx_vperm2f128_v4df_operand and its sibling predicates, and also used
+   by the insn patterns to turn the parallel back into a mask.
+   Returns 0 if PAR does not match, otherwise the imm8 plus one.  */
+
+int
+avx_vperm2f128_parallel (rtx par, enum machine_mode mode)
+{
+  unsigned nelt = GET_MODE_NUNITS (mode);
+  unsigned half = nelt / 2;
+  unsigned char sel[8];
+  unsigned imm = 0;
+  unsigned k, lane;
+
+  if (XVECLEN (par, 0) != (int) nelt)
+    return 0;
+
+  /* Each selector has to be a CONST_INT below 2 * NELT.  Stash the
+     values in a plain array so the structural checks below are
+     simple integer comparisons.  */
+  for (k = 0; k < nelt; ++k)
+    {
+      rtx elt = XVECEXP (par, 0, k);
+      unsigned HOST_WIDE_INT val;
+
+      if (!CONST_INT_P (elt))
+        return 0;
+      val = INTVAL (elt);
+      if (val >= 2 * nelt)
+        return 0;
+      sel[k] = val;
+    }
+
+  /* Handle the low and the high half of the result in turn.  */
+  for (lane = 0; lane < 2; ++lane)
+    {
+      unsigned first = lane * half;
+      unsigned last = lane ? nelt : half;
+      unsigned start = sel[first];
+
+      /* The selectors of one half must be consecutive.  */
+      for (k = first; k + 1 < last; ++k)
+        if (sel[k] + 1 != sel[k + 1])
+          return 0;
+
+      /* The run must begin on a half-vector boundary; the index of
+         that half becomes this lane's 4-bit field of the mask.  */
+      if (start % half != 0)
+        return 0;
+      imm |= (start / half) << (lane * 4);
+    }
+
+  /* Bias the result by one so that a successful match is never zero.  */
+  return imm + 1;
+}
\f
/* Store OPERAND to the memory after reload is completed. This means
(define_predicate "avx_vpermilp_v2df_operand"
(and (match_code "parallel")
(match_test "avx_vpermilp_parallel (op, V2DFmode)")))
+
+;; Return true if OP is a parallel that avx_vperm2f128_parallel accepts as a vperm2f128 permute for the named mode.
+
+(define_predicate "avx_vperm2f128_v8sf_operand"
+ (and (match_code "parallel")
+ (match_test "avx_vperm2f128_parallel (op, V8SFmode)")))
+
+(define_predicate "avx_vperm2f128_v8si_operand"
+ (and (match_code "parallel")
+ (match_test "avx_vperm2f128_parallel (op, V8SImode)")))
+
+(define_predicate "avx_vperm2f128_v4df_operand"
+ (and (match_code "parallel")
+ (match_test "avx_vperm2f128_parallel (op, V4DFmode)")))
(set_attr "prefix" "vex")
(set_attr "mode" "<MODE>")])
-(define_insn "avx_vperm2f128<mode>3"
+;; Expand a vperm2f128 whose selector immediate is operand 3.  When
+;; neither zeroing bit of the immediate (bit 3 or bit 7) is set, the
+;; permute is emitted as a vec_select from the vec_concat of operands 1
+;; and 2.  Otherwise the preparation code falls through and the unspec
+;; form in the template is emitted.
+(define_expand "avx_vperm2f128<mode>3"
+  [(set (match_operand:AVX256MODE2P 0 "register_operand" "")
+        (unspec:AVX256MODE2P
+          [(match_operand:AVX256MODE2P 1 "register_operand" "")
+           (match_operand:AVX256MODE2P 2 "nonimmediate_operand" "")
+           (match_operand:SI 3 "const_0_to_255_operand" "")]
+          UNSPEC_VPERMIL2F128))]
+  "TARGET_AVX"
+{
+  /* The selector is the immediate in operand 3.  Operand 2 is the
+     second vector input and must not be read with INTVAL.  */
+  int mask = INTVAL (operands[3]);
+  if ((mask & 0x88) == 0)
+    {
+      rtx perm[<ssescalarnum>], t1, t2;
+      int i, base, nelt = <ssescalarnum>, nelt2 = nelt / 2;
+
+      /* Bits 1:0 pick which half of the concatenated inputs feeds the
+         low half of the result.  */
+      base = (mask & 3) * nelt2;
+      for (i = 0; i < nelt2; ++i)
+        perm[i] = GEN_INT (base + i);
+
+      /* Bits 5:4 do the same for the high half of the result.  */
+      base = ((mask >> 4) & 3) * nelt2;
+      for (i = 0; i < nelt2; ++i)
+        perm[i + nelt2] = GEN_INT (base + i);
+
+      /* Build (set op0 (vec_select (vec_concat op1 op2) (parallel ...))).  */
+      t2 = gen_rtx_VEC_CONCAT (<ssedoublesizemode>mode,
+                               operands[1], operands[2]);
+      t1 = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (nelt, perm));
+      t2 = gen_rtx_VEC_SELECT (<MODE>mode, t2, t1);
+      t2 = gen_rtx_SET (VOIDmode, operands[0], t2);
+      emit_insn (t2);
+      DONE;
+    }
+})
+
+;; Note that bits 7 and 3 of the imm8 allow lanes to be zeroed, which
+;; means that in order to represent this properly in rtl we'd have to
+;; nest *another* vec_concat with a zero operand and do the select from
+;; a 4x wide vector. That doesn't seem very nice.
+(define_insn "*avx_vperm2f128<mode>_full"
[(set (match_operand:AVX256MODE2P 0 "register_operand" "=x")
(unspec:AVX256MODE2P
[(match_operand:AVX256MODE2P 1 "register_operand" "x")
(set_attr "prefix" "vex")
(set_attr "mode" "V8SF")])
+;; vperm2f128 written as a vec_select from the vec_concat of the two
+;; inputs.  Operand 3 is the selection parallel, validated by the
+;; avx_vperm2f128_<mode>_operand predicate.
+(define_insn "*avx_vperm2f128<mode>_nozero"
+  [(set (match_operand:AVX256MODE2P 0 "register_operand" "=x")
+        (vec_select:AVX256MODE2P
+          (vec_concat:<ssedoublesizemode>
+            (match_operand:AVX256MODE2P 1 "register_operand" "x")
+            (match_operand:AVX256MODE2P 2 "nonimmediate_operand" "xm"))
+          (match_parallel 3 "avx_vperm2f128_<mode>_operand"
+            [(match_operand 4 "const_int_operand" "")])))]
+  "TARGET_AVX"
+{
+  /* avx_vperm2f128_parallel yields the imm8 plus one for a matching
+     parallel; drop the bias and print the immediate as operand 3.  */
+  operands[3]
+    = GEN_INT (avx_vperm2f128_parallel (operands[3], <MODE>mode) - 1);
+  return "vperm2f128\t{%3, %2, %1, %0|%0, %1, %2, %3}";
+}
+  [(set_attr "type" "sselog")
+   (set_attr "prefix_extra" "1")
+   (set_attr "length_immediate" "1")
+   (set_attr "prefix" "vex")
+   (set_attr "mode" "V8SF")])
+
(define_insn "avx_vbroadcasts<avxmodesuffixf2c><avxmodesuffix>"
[(set (match_operand:AVXMODEF4P 0 "register_operand" "=x")
(vec_concat:AVXMODEF4P