}
-/* Add a memory operand with mode MODE and address ADDR. */
+/* Add a memory operand with mode MODE, taking the address from call
+   argument ARGNO. */
-rtx
-function_expander::add_mem_operand (machine_mode mode, rtx addr)
+void
+function_expander::add_mem_operand (machine_mode mode, unsigned argno)
{
gcc_assert (VECTOR_MODE_P (mode));
+ rtx addr = expand_normal (CALL_EXPR_ARG (exp, argno));
rtx mem = gen_rtx_MEM (mode, memory_address (mode, addr));
/* The memory is only guaranteed to be element-aligned. */
set_mem_align (mem, GET_MODE_ALIGNMENT (GET_MODE_INNER (mode)));
add_fixed_operand (mem);
- return mem;
}
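With the address expansion folded into add_mem_operand, each caller shrinks
from expanding the address itself to passing the call-argument index, as the
two hunks below show; schematically:

    /* Before: the caller expands the address.  */
    rtx addr = expand_normal (CALL_EXPR_ARG (exp, arg_offset++));
    add_mem_operand (mode, addr);

    /* After: the caller passes the argument index.  */
    add_mem_operand (mode, arg_offset++);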
/* Use contiguous load INSN. */
else
add_vundef_operand (mode);
- tree addr_arg = CALL_EXPR_ARG (exp, arg_offset++);
- rtx addr = expand_normal (addr_arg);
- add_mem_operand (mode, addr);
+ add_mem_operand (mode, arg_offset++);
for (int argno = arg_offset; argno < call_expr_nargs (exp); argno++)
add_input_operand (argno);
/* Record the offset to get the argument. */
int arg_offset = 0;
- int addr_loc = use_real_mask_p (pred) ? 1 : 0;
- tree addr_arg = CALL_EXPR_ARG (exp, addr_loc);
- rtx addr = expand_normal (addr_arg);
- rtx mem = add_mem_operand (mode, addr);
+ add_mem_operand (mode, use_real_mask_p (pred) ? 1 : 0);
if (use_real_mask_p (pred))
add_input_operand (arg_offset++);
else
add_all_one_mask_operand (mask_mode);
- /* To model "+m" constraint, we include memory operand into input. */
- add_input_operand (mode, mem);
-
arg_offset++;
for (int argno = arg_offset; argno < call_expr_nargs (exp); argno++)
add_input_operand (argno);
- add_input_operand (Pmode, get_tail_policy_for_pred (pred));
- add_input_operand (Pmode, get_mask_policy_for_pred (pred));
- add_input_operand (Pmode, get_avl_type_rtx (avl_type::NONVLMAX));
-
return generate_insn (icode);
}
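Note which inputs disappear from the store expander: the memory operand is no
longer added a second time to model the "+m" constraint, and the policy and
avl-type operands are dropped, since a store has no destination register whose
tail or inactive elements need a policy, and the avl type is fixed by the
dedicated pattern's attributes (repeated from @pred_store below):

    (set (attr "avl_type") (symbol_ref "riscv_vector::NONVLMAX"))
    (set_attr "vl_op_idx" "3")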
;; The index of operand[] to get the merge op.
(define_attr "merge_op_idx" ""
- (cond [(eq_attr "type" "vlde,vste,vimov,vfmov,vldm,vstm,vlds,vmalu")
+ (cond [(eq_attr "type" "vlde,vimov,vfmov,vldm,vstm,vlds,vmalu")
(const_int 2)]
(const_int INVALID_ATTRIBUTE)))
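vste drops out of merge_op_idx because, in the dedicated store pattern below,
operand 2 is the value being stored and the "merge" slot is the destination
memory itself; the pattern has the shape:

    (set (mem) (if_then_else (mask) (reg) (match_dup 0)))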
(reg:SI VL_REGNUM)
(reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE)
(match_operand:V 3 "vector_move_operand" " m, m, vr, vr, viWc0")
- (match_operand:V 2 "vector_merge_operand" " 0, vu, vu0, vu0, vu0")))]
+ (match_operand:V 2 "vector_merge_operand" " 0, vu, vu, vu0, vu0")))]
"TARGET_VECTOR"
"@
vle<sew>.v\t%0,%3%p1
[(set_attr "type" "vlde,vlde,vste,vimov,vimov")
(set_attr "mode" "<MODE>")])
+;; Dedicated pattern for the vse.v instruction.  We can't reuse the pred_mov
+;; pattern, since it would have to model the memory operand as an input as
+;; well ("+m"), which produces inferior codegen.
+(define_insn "@pred_store<mode>"
+ [(set (match_operand:V 0 "memory_operand" "+m")
+ (if_then_else:V
+ (unspec:<VM>
+ [(match_operand:<VM> 1 "vector_mask_operand" "vmWc1")
+ (match_operand 3 "vector_length_operand" " rK")
+ (reg:SI VL_REGNUM)
+ (reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE)
+ (match_operand:V 2 "register_operand" " vr")
+ (match_dup 0)))]
+ "TARGET_VECTOR"
+ "vse<sew>.v\t%2,%0%p1"
+ [(set_attr "type" "vste")
+ (set_attr "mode" "<MODE>")
+ (set (attr "avl_type") (symbol_ref "riscv_vector::NONVLMAX"))
+ (set_attr "vl_op_idx" "3")])
+
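+;; The "@" in the pattern name lets the expander look the insn up by mode; a
+;; minimal sketch on the builtin side (generate_insn as in the hunk above):
+;;   insn_code icode = code_for_pred_store (mode);
+;;   return generate_insn (icode);
+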
;; vlm.v/vsm.v/vmclr.m/vmset.m.
;; constraint alternative 0 match vlm.v.
;; constraint alternative 1 match vsm.v.
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-march=rv32gcv -mabi=ilp32d -O3" } */
+
+#include "riscv_vector.h"
+
+void f (int * restrict in, int * restrict out, void * restrict mask_in, int n)
+{
+ vfloat32mf2_t v = __riscv_vle32_v_f32mf2 ((float *)(in + 10000), 19);
+ __riscv_vse32_v_f32mf2 ((float *)(out + 10000), v, 19);
+ vbool64_t mask = *(vbool64_t*)mask_in;
+ for (int i = 0; i < n; i++)
+ {
+ vint16mf2_t v1 = __riscv_vle16_v_i16mf2 ((int16_t *)(in + i + 1), 19);
+ __riscv_vse16_v_i16mf2 ((int16_t *)(out + i + 1), v1, 19);
+
+ vint32mf2_t v2 = __riscv_vle32_v_i32mf2 ((int32_t *)(in + i + 2), 19);
+ __riscv_vse32_v_i32mf2 ((int32_t *)(out + i + 2), v2, 19);
+
+ vint32mf2_t v3 = __riscv_vle32_v_i32mf2_tumu (mask, v2, (int32_t *)(in + i + 200), 13);
+ __riscv_vse32_v_i32mf2 ((int32_t *)(out + i + 200), v3, 13);
+
+ vfloat64m1_t v4 = __riscv_vle64_v_f64m1_m (mask, (double *)(in + i + 300), 11);
+ __riscv_vse64_v_f64m1 ((double *)(out + i + 300), v4, 11);
+
+ vfloat64m1_t v5 = __riscv_vle64_v_f64m1_tum (mask, v4, (double *)(in + i + 500), 11);
+ __riscv_vse64_v_f64m1 ((double *)(out + i + 500), v5, 11);
+
+ vfloat64m1_t v6 = __riscv_vle64_v_f64m1_mu (mask, v5, (double *)(in + i + 600), 11);
+ __riscv_vse64_v_f64m1_m (mask, (double *)(out + i + 600), v6, 11);
+
+ vuint8mf4_t v7 = __riscv_vle8_v_u8mf4 ((uint8_t *)(in + i + 700), 11);
+ __riscv_vse8_v_u8mf4 ((uint8_t *)(out + i + 700), v7, 11);
+ }
+}
+
+void f2 (int * restrict in, int * restrict out, void * restrict mask_in, int n)
+{
+ vfloat32mf2_t v = __riscv_vle32_v_f32mf2 ((float *)(in + 10000), 19);
+ __riscv_vse32_v_f32mf2 ((float *)(out + 10000), v, 19);
+ vbool64_t mask = *(vbool64_t*)mask_in;
+ for (int i = 0; i < n; i++)
+ {
+ vint16mf2_t v1 = __riscv_vle16_v_i16mf2 ((int16_t *)(in + i + 1), 19);
+ __riscv_vse16_v_i16mf2 ((int16_t *)(out + i + 1), v1, 19);
+
+ vint32mf2_t v2 = __riscv_vle32_v_i32mf2 ((int32_t *)(in + i + 2), 19);
+ __riscv_vse32_v_i32mf2 ((int32_t *)(out + i + 2), v2, 19);
+
+ vint32mf2_t v3 = __riscv_vle32_v_i32mf2_tumu (mask, v2, (int32_t *)(in + i + 200), 13);
+ __riscv_vse32_v_i32mf2 ((int32_t *)(out + i + 200), v2, 13);
+
+ vfloat64m1_t v4 = __riscv_vle64_v_f64m1_m (mask, (double *)(in + i + 300), 11);
+ __riscv_vse64_v_f64m1 ((double *)(out + i + 300), v4, 11);
+
+ vfloat64m1_t v5 = __riscv_vle64_v_f64m1_tum (mask, v4, (double *)(in + i + 500), 11);
+ __riscv_vse64_v_f64m1 ((double *)(out + i + 500), v5, 11);
+
+ vfloat64m1_t v6 = __riscv_vle64_v_f64m1_mu (mask, v5, (double *)(in + i + 600), 11);
+ __riscv_vse64_v_f64m1_m (mask, (double *)(out + i + 600), v6, 11);
+
+ vuint8mf4_t v7 = __riscv_vle8_v_u8mf4 ((uint8_t *)(in + i + 700), 11);
+ __riscv_vse8_v_u8mf4 ((uint8_t *)(out + i + 700), v7, 11);
+ }
+}
+
+void f3 (int * restrict in, int * restrict out, void * restrict mask_in, int n)
+{
+ vfloat32mf2_t v = __riscv_vle32_v_f32mf2 ((float *)(in + 10000), 19);
+ __riscv_vse32_v_f32mf2 ((float *)(out + 10000), v, 19);
+ vbool64_t mask = *(vbool64_t*)mask_in;
+ for (int i = 0; i < n; i++)
+ {
+ vint16mf2_t v1 = __riscv_vle16_v_i16mf2 ((int16_t *)(in + i + 1), 19);
+ __riscv_vse16_v_i16mf2 ((int16_t *)(out + i + 1), v1, 19);
+
+ vint32mf2_t v2 = __riscv_vle32_v_i32mf2 ((int32_t *)(in + i + 2), 19);
+ __riscv_vse32_v_i32mf2 ((int32_t *)(out + i + 2), v2, 19);
+
+ vint32mf2_t v3 = __riscv_vle32_v_i32mf2_tumu (mask, v2, (int32_t *)(in + i + 200), 13);
+ *(vint32mf2_t*)(out + i + 200) = v3;
+
+ vfloat64m1_t v4 = __riscv_vle64_v_f64m1_m (mask, (double *)(in + i + 300), 11);
+ __riscv_vse64_v_f64m1 ((double *)(out + i + 300), v4, 11);
+
+ vfloat64m1_t v5 = __riscv_vle64_v_f64m1_tum (mask, v4, (double *)(in + i + 500), 11);
+ __riscv_vse64_v_f64m1 ((double *)(out + i + 500), v5, 11);
+
+ vfloat64m1_t v6 = __riscv_vle64_v_f64m1_mu (mask, v5, (double *)(in + i + 600), 11);
+ __riscv_vse64_v_f64m1_m (mask, (double *)(out + i + 600), v6, 11);
+
+ vuint8mf4_t v7 = __riscv_vle8_v_u8mf4 ((uint8_t *)(in + i + 700), 11);
+ __riscv_vse8_v_u8mf4 ((uint8_t *)(out + i + 700), v7, 11);
+ }
+}
+
+/* There should be no redundant vector register spills, which would show up as
+   csrr vlenb instructions allocating stack space. */
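+/* Illustrative only (not output from an actual build): a redundant spill
+   would allocate stack with a sequence along the lines of
+       csrr  t0,vlenb
+       sub   sp,sp,t0
+   which the scan below rejects.  */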
+/* { dg-final { scan-assembler-not {csrr\s+[a-x0-9]+,\s*vlenb} } } */