From 3edc21af5272194794fbf24b2c5f0981c632e866 Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Thu, 14 May 2020 13:06:23 -0700 Subject: [PATCH] x86: Add cmpmemsi for -minline-all-stringops MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit We used to expand memcmp to "repz cmpsb" via cmpstrnsi. It was changed by commit 9b0f6f5e511ca512e4faeabc81d2fd3abad9b02f Author: Nick Clifton Date: Fri Aug 12 16:26:11 2011 +0000 builtins.c (expand_builtin_memcmp): Do not use cmpstrnsi pattern. * builtins.c (expand_builtin_memcmp): Do not use cmpstrnsi pattern. * doc/md.texi (cmpstrn): Note that the comparison stops if both fetched bytes are zero. (cmpstr): Likewise. (cmpmem): Note that the comparison does not stop if both of the fetched bytes are zero. Duplicate the cmpstrn pattern for cmpmem. The only difference is that the length argument of cmpmem is guaranteed to be less than or equal to the lengths of the two memory areas. Since "repz cmpsb" can be much slower than a memcmp function implemented with vector instructions (see https://gcc.gnu.org/bugzilla/show_bug.cgi?id=43052), expand cmpmem to "repz cmpsb" only for -minline-all-stringops. gcc/ PR target/95151 * config/i386/i386-expand.c (ix86_expand_cmpstrn_or_cmpmem): New function. * config/i386/i386-protos.h (ix86_expand_cmpstrn_or_cmpmem): New prototype. * config/i386/i386.md (cmpmemsi): New pattern. gcc/testsuite/ PR target/95151 * gcc.target/i386/pr95151-1.c: New test. * gcc.target/i386/pr95151-2.c: Likewise. * gcc.target/i386/pr95151-3.c: Likewise. * gcc.target/i386/pr95151-4.c: Likewise. 
--- gcc/config/i386/i386-expand.c | 84 +++++++++++++++++++++++++++++++ gcc/config/i386/i386-protos.h | 1 + gcc/config/i386/i386.md | 84 ++++++++----------------------- gcc/testsuite/gcc.target/i386/pr95151-1.c | 17 +++++++ gcc/testsuite/gcc.target/i386/pr95151-2.c | 10 ++++ gcc/testsuite/gcc.target/i386/pr95151-3.c | 18 +++++++ gcc/testsuite/gcc.target/i386/pr95151-4.c | 11 ++++ 7 files changed, 162 insertions(+), 63 deletions(-) create mode 100644 gcc/testsuite/gcc.target/i386/pr95151-1.c create mode 100644 gcc/testsuite/gcc.target/i386/pr95151-2.c create mode 100644 gcc/testsuite/gcc.target/i386/pr95151-3.c create mode 100644 gcc/testsuite/gcc.target/i386/pr95151-4.c diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c index 3e8afe6..bd83a8e 100644 --- a/gcc/config/i386/i386-expand.c +++ b/gcc/config/i386/i386-expand.c @@ -7673,6 +7673,90 @@ ix86_expand_set_or_cpymem (rtx dst, rtx src, rtx count_exp, rtx val_exp, return true; } +/* Expand cmpstrn or memcmp. */ + +bool +ix86_expand_cmpstrn_or_cmpmem (rtx result, rtx src1, rtx src2, + rtx length, rtx align, bool is_cmpstrn) +{ + if (optimize_insn_for_size_p () && !TARGET_INLINE_ALL_STRINGOPS) + return false; + + /* Can't use this if the user has appropriated ecx, esi or edi. */ + if (fixed_regs[CX_REG] || fixed_regs[SI_REG] || fixed_regs[DI_REG]) + return false; + + if (is_cmpstrn) + { + /* For strncmp, length is the maximum length, which can be larger + than actual string lengths. We can expand the cmpstrn pattern + to "repz cmpsb" only if one of the strings is a constant so + that expand_builtin_strncmp() can write the length argument to + be the minimum of the const string length and the actual length + argument. Otherwise, "repz cmpsb" may pass the 0 byte. 
*/ + tree t1 = MEM_EXPR (src1); + tree t2 = MEM_EXPR (src2); + if (!((t1 && TREE_CODE (t1) == MEM_REF + && TREE_CODE (TREE_OPERAND (t1, 0)) == ADDR_EXPR + && (TREE_CODE (TREE_OPERAND (TREE_OPERAND (t1, 0), 0)) + == STRING_CST)) + || (t2 && TREE_CODE (t2) == MEM_REF + && TREE_CODE (TREE_OPERAND (t2, 0)) == ADDR_EXPR + && (TREE_CODE (TREE_OPERAND (TREE_OPERAND (t2, 0), 0)) + == STRING_CST)))) + return false; + } + else + { + /* Expand memcmp to "repz cmpsb" only for -minline-all-stringops + since "repz cmpsb" can be much slower than memcmp function + implemented with vector instructions, see + + https://gcc.gnu.org/bugzilla/show_bug.cgi?id=43052 + */ + if (!TARGET_INLINE_ALL_STRINGOPS) + return false; + } + + rtx addr1 = copy_addr_to_reg (XEXP (src1, 0)); + rtx addr2 = copy_addr_to_reg (XEXP (src2, 0)); + if (addr1 != XEXP (src1, 0)) + src1 = replace_equiv_address_nv (src1, addr1); + if (addr2 != XEXP (src2, 0)) + src2 = replace_equiv_address_nv (src2, addr2); + + /* NB: Make a copy of the data length to avoid changing the original + data length by cmpstrnqi patterns. */ + length = ix86_zero_extend_to_Pmode (length); + rtx lengthreg = gen_reg_rtx (Pmode); + emit_move_insn (lengthreg, length); + + /* If we are testing strict equality, we can use known alignment to + good advantage. This may be possible with combine, particularly + once cc0 is dead. 
*/ + if (CONST_INT_P (length)) + { + if (length == const0_rtx) + { + emit_move_insn (result, const0_rtx); + return true; + } + emit_insn (gen_cmpstrnqi_nz_1 (addr1, addr2, lengthreg, align, + src1, src2)); + } + else + { + emit_insn (gen_cmp_1 (Pmode, lengthreg, lengthreg)); + emit_insn (gen_cmpstrnqi_1 (addr1, addr2, lengthreg, align, + src1, src2)); + } + + rtx out = gen_lowpart (QImode, result); + emit_insn (gen_cmpintqi (out)); + emit_move_insn (result, gen_rtx_SIGN_EXTEND (SImode, out)); + + return true; +} /* Expand the appropriate insns for doing strlen if not just doing repnz; scasb diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h index c5b700e..69e2b32 100644 --- a/gcc/config/i386/i386-protos.h +++ b/gcc/config/i386/i386-protos.h @@ -71,6 +71,7 @@ extern int avx_vperm2f128_parallel (rtx par, machine_mode mode); extern bool ix86_expand_strlen (rtx, rtx, rtx, rtx); extern bool ix86_expand_set_or_cpymem (rtx, rtx, rtx, rtx, rtx, rtx, rtx, rtx, rtx, rtx, bool); +extern bool ix86_expand_cmpstrn_or_cmpmem (rtx, rtx, rtx, rtx, rtx, bool); extern bool constant_address_p (rtx); extern bool legitimate_pic_operand_p (rtx); diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index 8730816..751801d 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/config/i386/i386.md @@ -18115,6 +18115,22 @@ (const_string "*"))) (set_attr "mode" "QI")]) +(define_expand "cmpmemsi" + [(set (match_operand:SI 0 "register_operand" "") + (compare:SI (match_operand:BLK 1 "memory_operand" "") + (match_operand:BLK 2 "memory_operand" "") ) ) + (use (match_operand 3 "general_operand")) + (use (match_operand 4 "immediate_operand"))] + "" +{ + if (ix86_expand_cmpstrn_or_cmpmem (operands[0], operands[1], + operands[2], operands[3], + operands[4], false)) + DONE; + else + FAIL; +}) + (define_expand "cmpstrnsi" [(set (match_operand:SI 0 "register_operand") (compare:SI (match_operand:BLK 1 "general_operand") @@ -18123,70 +18139,12 @@ (use (match_operand 4 
"immediate_operand"))] "" { - rtx addr1, addr2, countreg, align, out; - - if (optimize_insn_for_size_p () && !TARGET_INLINE_ALL_STRINGOPS) - FAIL; - - /* Can't use this if the user has appropriated ecx, esi or edi. */ - if (fixed_regs[CX_REG] || fixed_regs[SI_REG] || fixed_regs[DI_REG]) - FAIL; - - /* One of the strings must be a constant. If so, expand_builtin_strncmp() - will have rewritten the length arg to be the minimum of the const string - length and the actual length arg. If both strings are the same and - shorter than the length arg, repz cmpsb will not stop at the 0 byte and - will incorrectly base the results on chars past the 0 byte. */ - tree t1 = MEM_EXPR (operands[1]); - tree t2 = MEM_EXPR (operands[2]); - if (!((t1 && TREE_CODE (t1) == MEM_REF - && TREE_CODE (TREE_OPERAND (t1, 0)) == ADDR_EXPR - && TREE_CODE (TREE_OPERAND (TREE_OPERAND (t1, 0), 0)) == STRING_CST) - || (t2 && TREE_CODE (t2) == MEM_REF - && TREE_CODE (TREE_OPERAND (t2, 0)) == ADDR_EXPR - && TREE_CODE (TREE_OPERAND (TREE_OPERAND (t2, 0), 0)) == STRING_CST))) - FAIL; - - addr1 = copy_addr_to_reg (XEXP (operands[1], 0)); - addr2 = copy_addr_to_reg (XEXP (operands[2], 0)); - if (addr1 != XEXP (operands[1], 0)) - operands[1] = replace_equiv_address_nv (operands[1], addr1); - if (addr2 != XEXP (operands[2], 0)) - operands[2] = replace_equiv_address_nv (operands[2], addr2); - - /* NB: Make a copy of the data length to avoid changing the original - data length by cmpstrnqi patterns. */ - rtx count = ix86_zero_extend_to_Pmode (operands[3]); - countreg = gen_reg_rtx (Pmode); - emit_move_insn (countreg, count); - - /* %%% Iff we are testing strict equality, we can use known alignment - to good advantage. This may be possible with combine, particularly - once cc0 is dead. 
*/ - align = operands[4]; - - if (CONST_INT_P (operands[3])) - { - if (operands[3] == const0_rtx) - { - emit_move_insn (operands[0], const0_rtx); - DONE; - } - emit_insn (gen_cmpstrnqi_nz_1 (addr1, addr2, countreg, align, - operands[1], operands[2])); - } + if (ix86_expand_cmpstrn_or_cmpmem (operands[0], operands[1], + operands[2], operands[3], + operands[4], true)) + DONE; else - { - emit_insn (gen_cmp_1 (Pmode, countreg, countreg)); - emit_insn (gen_cmpstrnqi_1 (addr1, addr2, countreg, align, - operands[1], operands[2])); - } - - out = gen_lowpart (QImode, operands[0]); - emit_insn (gen_cmpintqi (out)); - emit_move_insn (operands[0], gen_rtx_SIGN_EXTEND (SImode, out)); - - DONE; + FAIL; }) ;; Produce a tri-state integer (-1, 0, 1) from condition codes. diff --git a/gcc/testsuite/gcc.target/i386/pr95151-1.c b/gcc/testsuite/gcc.target/i386/pr95151-1.c new file mode 100644 index 0000000..54a7510 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr95151-1.c @@ -0,0 +1,17 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -minline-all-stringops" } */ + +struct foo +{ + char array[257]; +}; + +extern struct foo x; + +int +func (struct foo i) +{ + return __builtin_memcmp (&x, &i, sizeof (x)) ? 1 : 2; +} + +/* { dg-final { scan-assembler-not "call\[\\t \]*_?memcmp" } } */ diff --git a/gcc/testsuite/gcc.target/i386/pr95151-2.c b/gcc/testsuite/gcc.target/i386/pr95151-2.c new file mode 100644 index 0000000..8f9d8ee --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr95151-2.c @@ -0,0 +1,10 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -minline-all-stringops" } */ + +int +func (void *d, void *s, unsigned int l) +{ + return __builtin_memcmp (d, s, l) ? 
1 : 2; +} + +/* { dg-final { scan-assembler-not "call\[\\t \]*_?memcmp" } } */ diff --git a/gcc/testsuite/gcc.target/i386/pr95151-3.c b/gcc/testsuite/gcc.target/i386/pr95151-3.c new file mode 100644 index 0000000..14cbdec --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr95151-3.c @@ -0,0 +1,18 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -msse2 -mno-inline-all-stringops" } */ + +struct foo +{ + char array[257]; +}; + +extern struct foo x; + +int +func (struct foo i) +{ + return __builtin_memcmp (&x, &i, sizeof (x)) ? 1 : 2; +} + +/* { dg-final { scan-assembler "call\[\\t \]*_?memcmp" } } */ +/* { dg-final { scan-assembler-not "cmpsb" } } */ diff --git a/gcc/testsuite/gcc.target/i386/pr95151-4.c b/gcc/testsuite/gcc.target/i386/pr95151-4.c new file mode 100644 index 0000000..c93b2b6 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr95151-4.c @@ -0,0 +1,11 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -msse2 -mno-inline-all-stringops" } */ + +int +func (void *d, void *s, unsigned int l) +{ + return __builtin_memcmp (d, s, l) ? 1 : 2; +} + +/* { dg-final { scan-assembler "call\[\\t \]*_?memcmp" } } */ +/* { dg-final { scan-assembler-not "cmpsb" } } */ -- 2.7.4