IBM Z: Provide rawmemchr{qi,hi,si} expander
author Stefan Schulze Frielinghaus <stefansf@linux.ibm.com>
Mon, 11 Oct 2021 07:59:32 +0000 (09:59 +0200)
committer Stefan Schulze Frielinghaus <stefansf@linux.ibm.com>
Mon, 11 Oct 2021 07:59:32 +0000 (09:59 +0200)
gcc/ChangeLog:

* config/s390/s390-protos.h (s390_rawmemchr): Add prototype.
* config/s390/s390.c (s390_rawmemchr): New function.
* config/s390/s390.md (rawmemchr<SINT:mode>): New expander.
* config/s390/vector.md (@vec_vfees<mode>): Basically a copy of
the pattern vfees<mode> from vx-builtins.md.
* config/s390/vx-builtins.md (*vfees<mode>): Remove.

gcc/testsuite/ChangeLog:

* gcc.target/s390/rawmemchr-1.c: New test.

gcc/config/s390/s390-protos.h
gcc/config/s390/s390.c
gcc/config/s390/s390.md
gcc/config/s390/vector.md
gcc/config/s390/vx-builtins.md
gcc/testsuite/gcc.target/s390/rawmemchr-1.c [new file with mode: 0644]

index 4b03c6e..c161635 100644 (file)
@@ -66,6 +66,8 @@ s390_asm_declare_function_size (FILE *asm_out_file,
                                const char *fnname ATTRIBUTE_UNUSED, tree decl);
 #endif
 
+extern void s390_rawmemchr (machine_mode elt_mode, rtx dst, rtx src, rtx pat);
+
 #ifdef RTX_CODE
 extern int s390_extra_constraint_str (rtx, int, const char *);
 extern int s390_const_ok_for_constraint_p (HOST_WIDE_INT, int, const char *);
index 4f144d1..85dd78c 100644 (file)
@@ -16569,6 +16569,75 @@ s390_excess_precision (enum excess_precision_type type)
 }
 #endif
 
+/* Emit RTL for an unbounded search for the first element equal to PAT in
+   the memory referenced by SRC and set DST to the address of that element.
+   ELT_MODE is the integer mode of one element; PAT must be a CONST_INT and
+   is truncated to ELT_MODE.  The emitted code works on 16-byte vectors: one
+   probe at the possibly unaligned start address, then 16-byte aligned
+   chunks in a loop whose only exit is a reported match.  */
+void
+s390_rawmemchr (machine_mode elt_mode, rtx dst, rtx src, rtx pat)
+{
+  /* Vector mode made of 16 / GET_MODE_SIZE (elt_mode) elements of ELT_MODE.  */
+  machine_mode vec_mode = mode_for_vector (as_a <scalar_int_mode> (elt_mode),
+                                          16 / GET_MODE_SIZE (elt_mode)).require();
+  /* LENS first receives the loaded chunk and then the vec_vfees result.  */
+  rtx lens = gen_reg_rtx (V16QImode);
+  rtx pattern = gen_reg_rtx (vec_mode);
+  rtx loop_start = gen_label_rtx ();
+  rtx loop_end = gen_label_rtx ();
+  rtx addr = gen_reg_rtx (Pmode);
+  rtx offset = gen_reg_rtx (Pmode);
+  rtx loadlen = gen_reg_rtx (SImode);
+  rtx matchlen = gen_reg_rtx (SImode);
+  rtx mem;
+
+  /* Broadcast the truncated search value into every element of PATTERN.  */
+  pat = GEN_INT (trunc_int_for_mode (INTVAL (pat), elt_mode));
+  emit_insn (gen_rtx_SET (pattern, gen_rtx_VEC_DUPLICATE (vec_mode, pat)));
+
+  emit_move_insn (addr, XEXP (src, 0));
+
+  /* Unaligned head.  vlbb fills LENS from ADDR and lcbb puts a count into
+     LOADLEN, both with boundary operand 6.  NOTE(review): presumably this
+     loads only up to the next 4K block boundary and LOADLEN is the number
+     of bytes actually loaded -- confirm against the vlbb/lcbb patterns.  */
+  emit_insn (gen_vlbb (lens, gen_rtx_MEM (BLKmode, addr), GEN_INT (6)));
+  emit_insn (gen_lcbb (loadlen, addr, GEN_INT (6)));
+  lens = convert_to_mode (vec_mode, lens, 1);
+  emit_insn (gen_vec_vfees (vec_mode, lens, lens, pattern, GEN_INT (0)));
+  /* MATCHLEN is element 1 of the V4SI view of the vec_vfees result
+     (presumably the byte position of the first match -- TODO confirm).  */
+  lens = convert_to_mode (V4SImode, lens, 1);
+  emit_insn (gen_vec_extractv4sisi (matchlen, lens, GEN_INT (1)));
+  lens = convert_to_mode (vec_mode, lens, 1);
+  /* A match only counts if it lies below LOADLEN (unsigned compare); in
+     that case LENS already holds the result that is evaluated at LOOP_END.  */
+  emit_cmp_and_jump_insns (matchlen, loadlen, LT, NULL_RTX, SImode, 1, loop_end);
+  /* No match in the head: round ADDR up to the next multiple of 16.  */
+  force_expand_binop (Pmode, add_optab, addr, GEN_INT(16), addr, 1, OPTAB_DIRECT);
+  force_expand_binop (Pmode, and_optab, addr, GEN_INT(~HOST_WIDE_INT_UC(0xf)), addr, 1, OPTAB_DIRECT);
+  /* Now, ADDR is 16-byte aligned.  */
+
+  /* First aligned chunk, probed ahead of the loop.  With VSTRING_FLAG_CS the
+     outcome is taken from the condition code; the branch to LOOP_END is
+     annotated as very unlikely.  NOTE(review): CC mask 4 is used as the
+     "element found" condition here and below -- confirm the CC encoding.  */
+  mem = gen_rtx_MEM (vec_mode, addr);
+  set_mem_align (mem, 128);
+  emit_move_insn (lens, mem);
+  emit_insn (gen_vec_vfees (vec_mode, lens, lens, pattern, GEN_INT (VSTRING_FLAG_CS)));
+  add_int_reg_note (s390_emit_ccraw_jump (4, EQ, loop_end),
+                   REG_BR_PROB,
+                   profile_probability::very_unlikely ().to_reg_br_prob_note ());
+
+  emit_label (loop_start);
+  LABEL_NUSES (loop_start) = 1;
+
+  /* Main loop: advance by 16 bytes, probe the next aligned chunk and branch
+     back (annotated as very likely) as long as mask 4 is not matched.  */
+  force_expand_binop (Pmode, add_optab, addr, GEN_INT (16), addr, 1, OPTAB_DIRECT);
+  mem = gen_rtx_MEM (vec_mode, addr);
+  set_mem_align (mem, 128);
+  emit_move_insn (lens, mem);
+  emit_insn (gen_vec_vfees (vec_mode, lens, lens, pattern, GEN_INT (VSTRING_FLAG_CS)));
+  add_int_reg_note (s390_emit_ccraw_jump (4, NE, loop_start),
+                   REG_BR_PROB,
+                   profile_probability::very_likely ().to_reg_br_prob_note ());
+
+  emit_label (loop_end);
+  /* NOTE(review): LOOP_END is the target of two branches emitted above, yet
+     its use count is set to 1 here -- presumably the counts are recomputed
+     later; confirm.  */
+  LABEL_NUSES (loop_end) = 1;
+
+  /* Fetch the match position from the vec_vfees result in LENS as a Pmode
+     value: element 0 of its V2DI view for 64-bit, element 1 of its V4SI
+     view otherwise.  NOTE(review): presumably a byte offset relative to
+     ADDR, given that it is added to ADDR unscaled below -- confirm.  */
+  if (TARGET_64BIT)
+    {
+      lens = convert_to_mode (V2DImode, lens, 1);
+      emit_insn (gen_vec_extractv2didi (offset, lens, GEN_INT (0)));
+    }
+  else
+    {
+      lens = convert_to_mode (V4SImode, lens, 1);
+      emit_insn (gen_vec_extractv4sisi (offset, lens, GEN_INT (1)));
+    }
+  /* DST = ADDR + OFFSET.  */
+  force_expand_binop (Pmode, add_optab, addr, offset, dst, 1, OPTAB_DIRECT);
+}
+
 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook.  */
 
 static unsigned HOST_WIDE_INT
index 1b894a9..b8bdbae 100644 (file)
                    UNSPECV_PPA)]
   "TARGET_ZEC12"
   "")
+
+; Expander rawmemchr<mode> for the modes of the SINT iterator (presumably
+; QI, HI and SI, cf. "rawmemchr{qi,hi,si}" in the commit title).
+; Operand 0 is the register receiving the resulting address, operand 1 the
+; memory to be searched and operand 2 the constant element to look for.
+; Only available with TARGET_VX; all RTL is emitted by s390_rawmemchr.
+(define_expand "rawmemchr<SINT:mode>"
+  [(match_operand      0 "register_operand")
+   (match_operand      1 "memory_operand")
+   (match_operand:SINT 2 "const_int_operand")]
+  "TARGET_VX"
+  "s390_rawmemchr(<SINT:MODE>mode, operands[0], operands[1], operands[2]); DONE;")
index 70274a6..1ed1d06 100644 (file)
   "vll\t%v0,%1,%2"
   [(set_attr "op_type" "VRS")])
 
+; Vector find element equal, setting the condition code.
+; Operand 0 receives the result computed from the vector operands 1 and 2;
+; the second set records that the condition code register is written, too.
+; Operand 3 is a constant flag mask which may only contain VSTRING_FLAG_ZS
+; and VSTRING_FLAG_CS.  The "@" prefix makes this a parameterized name, so
+; that the mode can be passed as an argument to gen_vec_vfees, as done in
+; s390_rawmemchr.
+; vfeebs, vfeehs, vfeefs
+; vfeezbs, vfeezhs, vfeezfs
+(define_insn "@vec_vfees<mode>"
+  [(set (match_operand:VI_HW_QHS 0 "register_operand" "=v")
+       (unspec:VI_HW_QHS [(match_operand:VI_HW_QHS 1 "register_operand" "v")
+                          (match_operand:VI_HW_QHS 2 "register_operand" "v")
+                          (match_operand:QI 3 "const_mask_operand" "C")]
+                         UNSPEC_VEC_VFEE))
+   (set (reg:CCRAW CC_REGNUM)
+       (unspec:CCRAW [(match_dup 1)
+                      (match_dup 2)
+                      (match_dup 3)]
+                     UNSPEC_VEC_VFEECC))]
+  "TARGET_VX"
+{
+  unsigned HOST_WIDE_INT flags = UINTVAL (operands[3]);
+
+  /* Any flag other than ZS and CS is rejected.  */
+  gcc_assert (!(flags & ~(VSTRING_FLAG_ZS | VSTRING_FLAG_CS)));
+  /* This pattern always sets the condition code (see the second set above),
+     hence CS does not influence the choice of the mnemonic.  */
+  flags &= ~VSTRING_FLAG_CS;
+
+  /* ZS selects the "z" variant of the mnemonic.  */
+  if (flags == VSTRING_FLAG_ZS)
+    return "vfeez<bhfgq>s\t%v0,%v1,%v2";
+  return "vfee<bhfgq>s\t%v0,%v1,%v2";
+}
+  [(set_attr "op_type" "VRR")])
+
 ; vfenebs, vfenehs, vfenefs
 ; vfenezbs, vfenezhs, vfenezfs
 (define_insn "vec_vfenes<mode>"
index 3e7b854..efa7799 100644 (file)
 
 ; Vector find element equal
 
-; vfeebs, vfeehs, vfeefs
-; vfeezbs, vfeezhs, vfeezfs
-(define_insn "*vfees<mode>"
-  [(set (match_operand:VI_HW_QHS 0 "register_operand" "=v")
-       (unspec:VI_HW_QHS [(match_operand:VI_HW_QHS 1 "register_operand" "v")
-                          (match_operand:VI_HW_QHS 2 "register_operand" "v")
-                          (match_operand:QI 3 "const_mask_operand" "C")]
-                         UNSPEC_VEC_VFEE))
-   (set (reg:CCRAW CC_REGNUM)
-       (unspec:CCRAW [(match_dup 1)
-                      (match_dup 2)
-                      (match_dup 3)]
-                     UNSPEC_VEC_VFEECC))]
-  "TARGET_VX"
-{
-  unsigned HOST_WIDE_INT flags = UINTVAL (operands[3]);
-
-  gcc_assert (!(flags & ~(VSTRING_FLAG_ZS | VSTRING_FLAG_CS)));
-  flags &= ~VSTRING_FLAG_CS;
-
-  if (flags == VSTRING_FLAG_ZS)
-    return "vfeez<bhfgq>s\t%v0,%v1,%v2";
-  return "vfee<bhfgq>s\t%v0,%v1,%v2,%b3";
-}
-  [(set_attr "op_type" "VRR")])
-
 ; vfeeb, vfeeh, vfeef
 (define_insn "vfee<mode>"
   [(set (match_operand:VI_HW_QHS                    0 "register_operand" "=v")
diff --git a/gcc/testsuite/gcc.target/s390/rawmemchr-1.c b/gcc/testsuite/gcc.target/s390/rawmemchr-1.c
new file mode 100644 (file)
index 0000000..a512570
--- /dev/null
@@ -0,0 +1,99 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -ftree-loop-distribution -fdump-tree-ldist-details -mzarch -march=z13" } */
+/* { dg-final { scan-tree-dump-times "generated rawmemchrQI" 2 "ldist" } } */
+/* { dg-final { scan-tree-dump-times "generated rawmemchrHI" 2 "ldist" } } */
+/* { dg-final { scan-tree-dump-times "generated rawmemchrSI" 2 "ldist" } } */
+
+#include <string.h>
+#include <assert.h>
+#include <stdint.h>
+#include <stdlib.h>
+
+/* Define rawmemchr_T, which returns a pointer to the first element of S
+   that equals PATTERN.  The loop has no bound, so PATTERN has to occur in
+   the memory starting at S.  Each element size is instantiated twice below
+   (signed and unsigned), matching the count of 2 expected by the dg-final
+   directives above.  The functions are noinline/noclone, presumably to keep
+   one searchable loop per instantiation.  */
+#define rawmemchrT(T, pattern)     \
+__attribute__((noinline,noclone))  \
+T* rawmemchr_##T (T *s)            \
+{                                  \
+  while (*s != pattern)            \
+    ++s;                           \
+  return s;                        \
+}
+
+rawmemchrT(int8_t, (int8_t)0xde)
+rawmemchrT(uint8_t, 0xde)
+rawmemchrT(int16_t, (int16_t)0xdead)
+rawmemchrT(uint16_t, 0xdead)
+rawmemchrT(int32_t, (int32_t)0xdeadbeef)
+rawmemchrT(uint32_t, 0xdeadbeef)
+
+/* Define run_T, the runtime check for rawmemchr_T.  A buffer of 2 * 4096
+   elements is filled with 0x0a bytes.  Q is the next 4096-byte aligned
+   address above BUF (a full 4096 bytes further if BUF is already aligned)
+   and P, where used, starts 8 bytes before Q.  Every case plants PATTERN at
+   a single index, asserts that the search started at the given element
+   returns exactly that address, and then overwrites the element with
+   (T) 0xaaaaaaaa again before the next case runs.  */
+#define runT(T, pattern)                           \
+void run_##T ()                                    \
+{                                                  \
+  T *buf = malloc (4096 * 2 * sizeof(T));          \
+  assert (buf != NULL);                            \
+  memset (buf, 0xa, 4096 * 2 * sizeof(T));         \
+  /* ensure q is 4096-byte aligned */              \
+  T *q = (T*)((unsigned char *)buf                 \
+              + (4096 - ((uintptr_t)buf & 4095))); \
+  T *p;                                            \
+  /* unaligned + block boundary + 1st load */      \
+  p = (T *) ((uintptr_t)q - 8);                    \
+  p[2] = pattern;                                  \
+  assert ((rawmemchr_##T (&p[0]) == &p[2]));       \
+  p[2] = (T) 0xaaaaaaaa;                           \
+  /* unaligned + block boundary + 2nd load */      \
+  p = (T *) ((uintptr_t)q - 8);                    \
+  p[6] = pattern;                                  \
+  assert ((rawmemchr_##T (&p[0]) == &p[6]));       \
+  p[6] = (T) 0xaaaaaaaa;                           \
+  /* unaligned + 1st load */                       \
+  q[5] = pattern;                                  \
+  assert ((rawmemchr_##T (&q[2]) == &q[5]));       \
+  q[5] = (T) 0xaaaaaaaa;                           \
+  /* unaligned + 2nd load */                       \
+  q[14] = pattern;                                 \
+  assert ((rawmemchr_##T (&q[2]) == &q[14]));      \
+  q[14] = (T) 0xaaaaaaaa;                          \
+  /* unaligned + 3rd load */                       \
+  q[19] = pattern;                                 \
+  assert ((rawmemchr_##T (&q[2]) == &q[19]));      \
+  q[19] = (T) 0xaaaaaaaa;                          \
+  /* unaligned + 4th load */                       \
+  q[25] = pattern;                                 \
+  assert ((rawmemchr_##T (&q[2]) == &q[25]));      \
+  q[25] = (T) 0xaaaaaaaa;                          \
+  /* aligned + 1st load */                         \
+  q[5] = pattern;                                  \
+  assert ((rawmemchr_##T (&q[0]) == &q[5]));       \
+  q[5] = (T) 0xaaaaaaaa;                           \
+  /* aligned + 2nd load */                         \
+  q[14] = pattern;                                 \
+  assert ((rawmemchr_##T (&q[0]) == &q[14]));      \
+  q[14] = (T) 0xaaaaaaaa;                          \
+  /* aligned + 3rd load */                         \
+  q[19] = pattern;                                 \
+  assert ((rawmemchr_##T (&q[0]) == &q[19]));      \
+  q[19] = (T) 0xaaaaaaaa;                          \
+  /* aligned + 4th load */                         \
+  q[25] = pattern;                                 \
+  assert ((rawmemchr_##T (&q[0]) == &q[25]));      \
+  q[25] = (T) 0xaaaaaaaa;                          \
+  free (buf);                                      \
+}
+
+runT(int8_t, (int8_t)0xde)
+runT(uint8_t, 0xde)
+runT(int16_t, (int16_t)0xdead)
+runT(uint16_t, 0xdead)
+runT(int32_t, (int32_t)0xdeadbeef)
+runT(uint32_t, 0xdeadbeef)
+
+/* Run the checks for all six element types; a failing assert aborts the
+   test, otherwise 0 is returned.  */
+int main (void)
+{
+  run_uint8_t ();
+  run_int8_t ();
+  run_uint16_t ();
+  run_int16_t ();
+  run_uint32_t ();
+  run_int32_t ();
+  return 0;
+}