From 64e6863ed571bbd89eb7a65e3727a2a4ecc1ea27 Mon Sep 17 00:00:00 2001 From: Richard Henderson Date: Tue, 17 May 2005 01:15:41 -0700 Subject: [PATCH] * config/i386/sse.md (mulv16qi3, mulv2di3): New. From-SVN: r99824 --- gcc/ChangeLog | 4 ++ gcc/config/i386/sse.md | 88 ++++++++++++++++++++++++++++++++++++ gcc/testsuite/gcc.dg/vect/vect-100.c | 28 ++++++++++++ 3 files changed, 120 insertions(+) create mode 100644 gcc/testsuite/gcc.dg/vect/vect-100.c diff --git a/gcc/ChangeLog b/gcc/ChangeLog index dc64af2..82233e9 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,3 +1,7 @@ +2005-05-17 Richard Henderson + + * config/i386/sse.md (mulv16qi3, mulv2di3): New. + 2005-05-17 Jakub Jelinek PR middle-end/21492 diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 76efe5f..5ff94ba 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -2392,6 +2392,50 @@ [(set_attr "type" "sseiadd") (set_attr "mode" "TI")]) +(define_expand "mulv16qi3" + [(set (match_operand:V16QI 0 "register_operand" "") + (mult:V16QI (match_operand:V16QI 1 "register_operand" "") + (match_operand:V16QI 2 "register_operand" "")))] + "TARGET_SSE2" +{ + rtx t[12], op0; + int i; + + for (i = 0; i < 12; ++i) + t[i] = gen_reg_rtx (V16QImode); + + /* Unpack data such that we've got a source byte in each low byte of + each word. We don't care what goes into the high byte of each word. + Rather than trying to get zero in there, most convenient is to let + it be a copy of the low byte. */ + emit_insn (gen_sse2_punpckhbw (t[0], operands[1], operands[1])); + emit_insn (gen_sse2_punpckhbw (t[1], operands[2], operands[2])); + emit_insn (gen_sse2_punpcklbw (t[2], operands[1], operands[1])); + emit_insn (gen_sse2_punpcklbw (t[3], operands[2], operands[2])); + + /* Multiply words. The end-of-line annotations here give a picture of what + the output of that instruction looks like. Dot means don't care; the + letters are the bytes of the result with A being the most significant. 
*/ + emit_insn (gen_mulv8hi3 (gen_lowpart (V8HImode, t[4]), /* .A.B.C.D.E.F.G.H */ + gen_lowpart (V8HImode, t[0]), + gen_lowpart (V8HImode, t[1]))); + emit_insn (gen_mulv8hi3 (gen_lowpart (V8HImode, t[5]), /* .I.J.K.L.M.N.O.P */ + gen_lowpart (V8HImode, t[2]), + gen_lowpart (V8HImode, t[3]))); + + /* Extract the relevant bytes and merge them back together. */ + emit_insn (gen_sse2_punpckhbw (t[6], t[5], t[4])); /* ..AI..BJ..CK..DL */ + emit_insn (gen_sse2_punpcklbw (t[7], t[5], t[4])); /* ..EM..FN..GO..HP */ + emit_insn (gen_sse2_punpckhbw (t[8], t[7], t[6])); /* ....AEIM....BFJN */ + emit_insn (gen_sse2_punpcklbw (t[9], t[7], t[6])); /* ....CGKO....DHLP */ + emit_insn (gen_sse2_punpckhbw (t[10], t[9], t[8])); /* ........ACEGIKMO */ + emit_insn (gen_sse2_punpcklbw (t[11], t[9], t[8])); /* ........BDFHJLNP */ + + op0 = operands[0]; + emit_insn (gen_sse2_punpcklbw (op0, t[11], t[10])); /* ABCDEFGHIJKLMNOP */ + DONE; +}) + (define_expand "mulv8hi3" [(set (match_operand:V8HI 0 "register_operand" "") (mult:V8HI (match_operand:V8HI 1 "nonimmediate_operand" "") @@ -2536,6 +2580,50 @@ DONE; }) +(define_expand "mulv2di3" + [(set (match_operand:V2DI 0 "register_operand" "") + (mult:V2DI (match_operand:V2DI 1 "nonimmediate_operand" "") + (match_operand:V2DI 2 "nonimmediate_operand" "")))] + "TARGET_SSE2" +{ + rtx t1, t2, t3, t4, t5, t6, thirtytwo; + rtx op0, op1, op2; + + op0 = operands[0]; + op1 = operands[1]; + op2 = operands[2]; + t1 = gen_reg_rtx (V2DImode); + t2 = gen_reg_rtx (V2DImode); + t3 = gen_reg_rtx (V2DImode); + t4 = gen_reg_rtx (V2DImode); + t5 = gen_reg_rtx (V2DImode); + t6 = gen_reg_rtx (V2DImode); + thirtytwo = GEN_INT (32); + + /* Multiply low parts. */ + emit_insn (gen_sse2_umulv2siv2di3 (t1, gen_lowpart (V4SImode, op1), + gen_lowpart (V4SImode, op2))); + + /* Shift input vectors right 32 bits so we can multiply high parts. 
*/ + emit_insn (gen_lshrv2di3 (t2, op1, thirtytwo)); + emit_insn (gen_lshrv2di3 (t3, op2, thirtytwo)); + + /* Multiply high parts by low parts. */ + emit_insn (gen_sse2_umulv2siv2di3 (t4, gen_lowpart (V4SImode, op1), + gen_lowpart (V4SImode, t3))); + emit_insn (gen_sse2_umulv2siv2di3 (t5, gen_lowpart (V4SImode, op2), + gen_lowpart (V4SImode, t2))); + + /* Shift them back. */ + emit_insn (gen_ashlv2di3 (t4, t4, thirtytwo)); + emit_insn (gen_ashlv2di3 (t5, t5, thirtytwo)); + + /* Add the three parts together. */ + emit_insn (gen_addv2di3 (t6, t1, t4)); + emit_insn (gen_addv2di3 (op0, t6, t5)); + DONE; +}) + (define_insn "ashr<mode>3" [(set (match_operand:SSEMODE24 0 "register_operand" "=x") (ashiftrt:SSEMODE24 diff --git a/gcc/testsuite/gcc.dg/vect/vect-100.c b/gcc/testsuite/gcc.dg/vect/vect-100.c new file mode 100644 index 0000000..3b803fc --- /dev/null +++ b/gcc/testsuite/gcc.dg/vect/vect-100.c @@ -0,0 +1,28 @@ +/* Assuming we can vectorize char multiplication, here's an execute test. */ + +#include <stdarg.h> +#include "tree-vect.h" + +extern void abort (void); +void foo() +{ + static unsigned char A[256], B[256], C[256]; + int i; + + for (i = 0; i < 256; ++i) + A[i] = B[i] = i; + + for (i = 0; i < 256; ++i) + C[i] = A[i] * B[i]; + + for (i = 0; i < 256; ++i) + if (C[i] != (unsigned char)(i * i)) + abort (); +} + +int main() +{ + check_vect (); + foo(); + return 0; +} -- 2.7.4