}
-/* Add a memory operand with mode MODE and address ADDR. */
+/* Add a memory operand with mode MODE, taking the address from call
+   argument ARGNO. */
-rtx
-function_expander::add_mem_operand (machine_mode mode, rtx addr)
+void
+function_expander::add_mem_operand (machine_mode mode, unsigned argno)
{
gcc_assert (VECTOR_MODE_P (mode));
+ rtx addr = expand_normal (CALL_EXPR_ARG (exp, argno));
rtx mem = gen_rtx_MEM (mode, memory_address (mode, addr));
/* The memory is only guaranteed to be element-aligned. */
set_mem_align (mem, GET_MODE_ALIGNMENT (GET_MODE_INNER (mode)));
add_fixed_operand (mem);
- return mem;
}
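With the address expansion folded into add_mem_operand, each caller shrinks
from expanding the address itself to passing the call-argument index, as the
two hunks below show; schematically:

    /* Before: the caller expands the address.  */
    rtx addr = expand_normal (CALL_EXPR_ARG (exp, arg_offset++));
    add_mem_operand (mode, addr);

    /* After: the caller passes the argument index.  */
    add_mem_operand (mode, arg_offset++);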
/* Use contiguous load INSN. */
else
add_vundef_operand (mode);
- tree addr_arg = CALL_EXPR_ARG (exp, arg_offset++);
- rtx addr = expand_normal (addr_arg);
- add_mem_operand (mode, addr);
+ add_mem_operand (mode, arg_offset++);
for (int argno = arg_offset; argno < call_expr_nargs (exp); argno++)
add_input_operand (argno);
/* Record the offset to get the argument. */
int arg_offset = 0;
- int addr_loc = use_real_mask_p (pred) ? 1 : 0;
- tree addr_arg = CALL_EXPR_ARG (exp, addr_loc);
- rtx addr = expand_normal (addr_arg);
- rtx mem = add_mem_operand (mode, addr);
+ add_mem_operand (mode, use_real_mask_p (pred) ? 1 : 0);
if (use_real_mask_p (pred))
add_input_operand (arg_offset++);
else
add_all_one_mask_operand (mask_mode);
- /* To model "+m" constraint, we include memory operand into input. */
- add_input_operand (mode, mem);
-
arg_offset++;
for (int argno = arg_offset; argno < call_expr_nargs (exp); argno++)
add_input_operand (argno);
- add_input_operand (Pmode, get_tail_policy_for_pred (pred));
- add_input_operand (Pmode, get_mask_policy_for_pred (pred));
- add_input_operand (Pmode, get_avl_type_rtx (avl_type::NONVLMAX));
-
return generate_insn (icode);
}
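Note which inputs disappear from the store expander: the memory operand is no
longer added a second time to model the "+m" constraint, and the policy and
avl-type operands are dropped, since a store has no destination register whose
tail or inactive elements need a policy, and the avl type is fixed by the
dedicated pattern's attributes (repeated from @pred_store below):

    (set (attr "avl_type") (symbol_ref "riscv_vector::NONVLMAX"))
    (set_attr "vl_op_idx" "3")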
;; The index of operand[] to get the merge op.
(define_attr "merge_op_idx" ""
- (cond [(eq_attr "type" "vlde,vste,vimov,vfmov,vldm,vstm,vlds,vmalu")
+ (cond [(eq_attr "type" "vlde,vimov,vfmov,vldm,vstm,vlds,vmalu")
(const_int 2)]
(const_int INVALID_ATTRIBUTE)))
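vste drops out of merge_op_idx because, in the dedicated store pattern below,
operand 2 is the value being stored and the "merge" slot is the destination
memory itself; the pattern has the shape:

    (set (mem) (if_then_else (mask) (reg) (match_dup 0)))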
(reg:SI VL_REGNUM)
(reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE)
(match_operand:V 3 "vector_move_operand" " m, m, vr, vr, viWc0")
- (match_operand:V 2 "vector_merge_operand" " 0, vu, vu0, vu0, vu0")))]
+ (match_operand:V 2 "vector_merge_operand" " 0, vu, vu, vu0, vu0")))]
"TARGET_VECTOR"
"@
vle<sew>.v\t%0,%3%p1
[(set_attr "type" "vlde,vlde,vste,vimov,vimov")
(set_attr "mode" "<MODE>")])
+;; Dedicated pattern for the vse.v instruction.  We can't reuse the pred_mov
+;; pattern, since it would have to model the memory operand as an input as
+;; well ("+m"), which produces inferior codegen.
+(define_insn "@pred_store<mode>"
+ [(set (match_operand:V 0 "memory_operand" "+m")
+ (if_then_else:V
+ (unspec:<VM>
+ [(match_operand:<VM> 1 "vector_mask_operand" "vmWc1")
+ (match_operand 3 "vector_length_operand" " rK")
+ (reg:SI VL_REGNUM)
+ (reg:SI VTYPE_REGNUM)] UNSPEC_VPREDICATE)
+ (match_operand:V 2 "register_operand" " vr")
+ (match_dup 0)))]
+ "TARGET_VECTOR"
+ "vse<sew>.v\t%2,%0%p1"
+ [(set_attr "type" "vste")
+ (set_attr "mode" "<MODE>")
+ (set (attr "avl_type") (symbol_ref "riscv_vector::NONVLMAX"))
+ (set_attr "vl_op_idx" "3")])
+
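+;; The "@" in the pattern name lets the expander look the insn up by mode; a
+;; minimal sketch on the builtin side (generate_insn as in the hunk above):
+;;   insn_code icode = code_for_pred_store (mode);
+;;   return generate_insn (icode);
+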
;; vlm.v/vsm.v/vmclr.m/vmset.m.
;; constraint alternative 0 match vlm.v.
;; constraint alternative 1 match vsm.v.
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-march=rv32gcv -mabi=ilp32d -O3" } */
+
+#include "riscv_vector.h"
+
+void f (int * restrict in, int * restrict out, void * restrict mask_in, int n)
+{
+ vfloat32mf2_t v = __riscv_vle32_v_f32mf2 ((float *)(in + 10000), 19);
+ __riscv_vse32_v_f32mf2 ((float *)(out + 10000), v, 19);
+ vbool64_t mask = *(vbool64_t*)mask_in;
+ for (int i = 0; i < n; i++)
+ {
+ vint16mf2_t v1 = __riscv_vle16_v_i16mf2 ((int16_t *)(in + i + 1), 19);
+ __riscv_vse16_v_i16mf2 ((int16_t *)(out + i + 1), v1, 19);
+
+ vint32mf2_t v2 = __riscv_vle32_v_i32mf2 ((int32_t *)(in + i + 2), 19);
+ __riscv_vse32_v_i32mf2 ((int32_t *)(out + i + 2), v2, 19);
+
+ vint32mf2_t v3 = __riscv_vle32_v_i32mf2_tumu (mask, v2, (int32_t *)(in + i + 200), 13);
+ __riscv_vse32_v_i32mf2 ((int32_t *)(out + i + 200), v3, 13);
+
+ vfloat64m1_t v4 = __riscv_vle64_v_f64m1_m (mask, (double *)(in + i + 300), 11);
+ __riscv_vse64_v_f64m1 ((double *)(out + i + 300), v4, 11);
+
+ vfloat64m1_t v5 = __riscv_vle64_v_f64m1_tum (mask, v4, (double *)(in + i + 500), 11);
+ __riscv_vse64_v_f64m1 ((double *)(out + i + 500), v5, 11);
+
+ vfloat64m1_t v6 = __riscv_vle64_v_f64m1_mu (mask, v5, (double *)(in + i + 600), 11);
+ __riscv_vse64_v_f64m1_m (mask, (double *)(out + i + 600), v6, 11);
+
+ vuint8mf4_t v7 = __riscv_vle8_v_u8mf4 ((uint8_t *)(in + i + 700), 11);
+ __riscv_vse8_v_u8mf4 ((uint8_t *)(out + i + 700), v7, 11);
+ }
+}
+
+void f2 (int * restrict in, int * restrict out, void * restrict mask_in, int n)
+{
+ vfloat32mf2_t v = __riscv_vle32_v_f32mf2 ((float *)(in + 10000), 19);
+ __riscv_vse32_v_f32mf2 ((float *)(out + 10000), v, 19);
+ vbool64_t mask = *(vbool64_t*)mask_in;
+ for (int i = 0; i < n; i++)
+ {
+ vint16mf2_t v1 = __riscv_vle16_v_i16mf2 ((int16_t *)(in + i + 1), 19);
+ __riscv_vse16_v_i16mf2 ((int16_t *)(out + i + 1), v1, 19);
+
+ vint32mf2_t v2 = __riscv_vle32_v_i32mf2 ((int32_t *)(in + i + 2), 19);
+ __riscv_vse32_v_i32mf2 ((int32_t *)(out + i + 2), v2, 19);
+
+ vint32mf2_t v3 = __riscv_vle32_v_i32mf2_tumu (mask, v2, (int32_t *)(in + i + 200), 13);
+ __riscv_vse32_v_i32mf2 ((int32_t *)(out + i + 200), v2, 13);
+
+ vfloat64m1_t v4 = __riscv_vle64_v_f64m1_m (mask, (double *)(in + i + 300), 11);
+ __riscv_vse64_v_f64m1 ((double *)(out + i + 300), v4, 11);
+
+ vfloat64m1_t v5 = __riscv_vle64_v_f64m1_tum (mask, v4, (double *)(in + i + 500), 11);
+ __riscv_vse64_v_f64m1 ((double *)(out + i + 500), v5, 11);
+
+ vfloat64m1_t v6 = __riscv_vle64_v_f64m1_mu (mask, v5, (double *)(in + i + 600), 11);
+ __riscv_vse64_v_f64m1_m (mask, (double *)(out + i + 600), v6, 11);
+
+ vuint8mf4_t v7 = __riscv_vle8_v_u8mf4 ((uint8_t *)(in + i + 700), 11);
+ __riscv_vse8_v_u8mf4 ((uint8_t *)(out + i + 700), v7, 11);
+ }
+}
+
+void f3 (int * restrict in, int * restrict out, void * restrict mask_in, int n)
+{
+ vfloat32mf2_t v = __riscv_vle32_v_f32mf2 ((float *)(in + 10000), 19);
+ __riscv_vse32_v_f32mf2 ((float *)(out + 10000), v, 19);
+ vbool64_t mask = *(vbool64_t*)mask_in;
+ for (int i = 0; i < n; i++)
+ {
+ vint16mf2_t v1 = __riscv_vle16_v_i16mf2 ((int16_t *)(in + i + 1), 19);
+ __riscv_vse16_v_i16mf2 ((int16_t *)(out + i + 1), v1, 19);
+
+ vint32mf2_t v2 = __riscv_vle32_v_i32mf2 ((int32_t *)(in + i + 2), 19);
+ __riscv_vse32_v_i32mf2 ((int32_t *)(out + i + 2), v2, 19);
+
+ vint32mf2_t v3 = __riscv_vle32_v_i32mf2_tumu (mask, v2, (int32_t *)(in + i + 200), 13);
+ *(vint32mf2_t*)(out + i + 200) = v3;
+
+ vfloat64m1_t v4 = __riscv_vle64_v_f64m1_m (mask, (double *)(in + i + 300), 11);
+ __riscv_vse64_v_f64m1 ((double *)(out + i + 300), v4, 11);
+
+ vfloat64m1_t v5 = __riscv_vle64_v_f64m1_tum (mask, v4, (double *)(in + i + 500), 11);
+ __riscv_vse64_v_f64m1 ((double *)(out + i + 500), v5, 11);
+
+ vfloat64m1_t v6 = __riscv_vle64_v_f64m1_mu (mask, v5, (double *)(in + i + 600), 11);
+ __riscv_vse64_v_f64m1_m (mask, (double *)(out + i + 600), v6, 11);
+
+ vuint8mf4_t v7 = __riscv_vle8_v_u8mf4 ((uint8_t *)(in + i + 700), 11);
+ __riscv_vse8_v_u8mf4 ((uint8_t *)(out + i + 700), v7, 11);
+ }
+}
+
+/* There should be no redundant vector register spills, which would show up as
+   csrr vlenb instructions allocating stack space. */
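+/* Illustrative only (not output from an actual build): a redundant spill
+   would allocate stack with a sequence along the lines of
+       csrr  t0,vlenb
+       sub   sp,sp,t0
+   which the scan below rejects.  */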
+/* { dg-final { scan-assembler-not {csrr\s+[a-x0-9]+,\s*vlenb} } } */