From 5fe3e6bf061da8d9b0e759927c340fe8e0f44725 Mon Sep 17 00:00:00 2001 From: Richard Sandiford Date: Fri, 8 Jan 2021 10:49:38 +0000 Subject: [PATCH] aarch64: Support unpacked CNOT on SVE This patch adds unpacked support for unconditional and conditional CNOT. The type suffix has to be taken from the element size rather than the container size. gcc/ * config/aarch64/aarch64-sve.md (*cnot<mode>): Extend from SVE_FULL_I to SVE_I. (*cond_cnot<mode>_2, *cond_cnot<mode>_any): Likewise. gcc/testsuite/ * gcc.target/aarch64/sve/cnot_2.c: New test. * gcc.target/aarch64/sve/cond_cnot_4.c: Likewise. * gcc.target/aarch64/sve/cond_cnot_4_run.c: Likewise. * gcc.target/aarch64/sve/cond_cnot_5.c: Likewise. * gcc.target/aarch64/sve/cond_cnot_5_run.c: Likewise. * gcc.target/aarch64/sve/cond_cnot_6.c: Likewise. * gcc.target/aarch64/sve/cond_cnot_6_run.c: Likewise. --- gcc/config/aarch64/aarch64-sve.md | 36 +++++++++++----------- gcc/testsuite/gcc.target/aarch64/sve/cnot_2.c | 29 +++++++++++++++++ gcc/testsuite/gcc.target/aarch64/sve/cond_cnot_4.c | 32 +++++++++++++++++++ .../gcc.target/aarch64/sve/cond_cnot_4_run.c | 26 ++++++++++++++++ gcc/testsuite/gcc.target/aarch64/sve/cond_cnot_5.c | 32 +++++++++++++++++++ .../gcc.target/aarch64/sve/cond_cnot_5_run.c | 26 ++++++++++++++++ gcc/testsuite/gcc.target/aarch64/sve/cond_cnot_6.c | 31 +++++++++++++++++++ .../gcc.target/aarch64/sve/cond_cnot_6_run.c | 26 ++++++++++++++++ 8 files changed, 220 insertions(+), 18 deletions(-) create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/cnot_2.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/cond_cnot_4.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/cond_cnot_4_run.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/cond_cnot_5.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/cond_cnot_5_run.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/cond_cnot_6.c create mode 100644 gcc/testsuite/gcc.target/aarch64/sve/cond_cnot_6_run.c diff --git a/gcc/config/aarch64/aarch64-sve.md 
b/gcc/config/aarch64/aarch64-sve.md index b83f991..2f5a5e3 100644 --- a/gcc/config/aarch64/aarch64-sve.md +++ b/gcc/config/aarch64/aarch64-sve.md @@ -3227,16 +3227,16 @@ ) (define_insn "*cnot<mode>" - [(set (match_operand:SVE_FULL_I 0 "register_operand" "=w, ?&w") - (unspec:SVE_FULL_I + [(set (match_operand:SVE_I 0 "register_operand" "=w, ?&w") + (unspec:SVE_I [(unspec:<VPRED> [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl") (match_operand:SI 5 "aarch64_sve_ptrue_flag") (eq:<VPRED> - (match_operand:SVE_FULL_I 2 "register_operand" "0, w") - (match_operand:SVE_FULL_I 3 "aarch64_simd_imm_zero"))] + (match_operand:SVE_I 2 "register_operand" "0, w") + (match_operand:SVE_I 3 "aarch64_simd_imm_zero"))] UNSPEC_PRED_Z) - (match_operand:SVE_FULL_I 4 "aarch64_simd_imm_one") + (match_operand:SVE_I 4 "aarch64_simd_imm_one") (match_dup 3)] UNSPEC_SEL))] "TARGET_SVE" @@ -3274,19 +3274,19 @@ ;; Predicated logical inverse, merging with the first input. (define_insn_and_rewrite "*cond_cnot<mode>_2" - [(set (match_operand:SVE_FULL_I 0 "register_operand" "=w, ?&w") - (unspec:SVE_FULL_I + [(set (match_operand:SVE_I 0 "register_operand" "=w, ?&w") + (unspec:SVE_I [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl") ;; Logical inverse of operand 2 (as above). - (unspec:SVE_FULL_I + (unspec:SVE_I [(unspec:<VPRED> [(match_operand 5) (const_int SVE_KNOWN_PTRUE) (eq:<VPRED> - (match_operand:SVE_FULL_I 2 "register_operand" "0, w") - (match_operand:SVE_FULL_I 3 "aarch64_simd_imm_zero"))] + (match_operand:SVE_I 2 "register_operand" "0, w") + (match_operand:SVE_I 3 "aarch64_simd_imm_zero"))] UNSPEC_PRED_Z) - (match_operand:SVE_FULL_I 4 "aarch64_simd_imm_one") + (match_operand:SVE_I 4 "aarch64_simd_imm_one") (match_dup 3)] UNSPEC_SEL) (match_dup 2)] @@ -3310,22 +3310,22 @@ ;; as earlyclobber helps to make the instruction more regular to the ;; register allocator. 
(define_insn_and_rewrite "*cond_cnot<mode>_any" - [(set (match_operand:SVE_FULL_I 0 "register_operand" "=&w, ?&w, ?&w") - (unspec:SVE_FULL_I + [(set (match_operand:SVE_I 0 "register_operand" "=&w, ?&w, ?&w") + (unspec:SVE_I [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl, Upl") ;; Logical inverse of operand 2 (as above). - (unspec:SVE_FULL_I + (unspec:SVE_I [(unspec:<VPRED> [(match_operand 5) (const_int SVE_KNOWN_PTRUE) (eq:<VPRED> - (match_operand:SVE_FULL_I 2 "register_operand" "w, w, w") - (match_operand:SVE_FULL_I 3 "aarch64_simd_imm_zero"))] + (match_operand:SVE_I 2 "register_operand" "w, w, w") + (match_operand:SVE_I 3 "aarch64_simd_imm_zero"))] UNSPEC_PRED_Z) - (match_operand:SVE_FULL_I 4 "aarch64_simd_imm_one") + (match_operand:SVE_I 4 "aarch64_simd_imm_one") (match_dup 3)] UNSPEC_SEL) - (match_operand:SVE_FULL_I 6 "aarch64_simd_reg_or_zero" "0, Dz, w")] + (match_operand:SVE_I 6 "aarch64_simd_reg_or_zero" "0, Dz, w")] UNSPEC_SEL))] "TARGET_SVE && !rtx_equal_p (operands[2], operands[6])" "@ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cnot_2.c b/gcc/testsuite/gcc.target/aarch64/sve/cnot_2.c new file mode 100644 index 0000000..fe77823 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cnot_2.c @@ -0,0 +1,29 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include <stdint.h> + +#define DEF_LOOP(TYPE1, TYPE2, COUNT) \ + void __attribute__ ((noipa)) \ + test_##TYPE1##_##TYPE2##_##TYPE3 (TYPE2 *restrict r, \ + TYPE1 *restrict pred, \ + TYPE2 *restrict a) \ + { \ + for (int i = 0; i < COUNT; ++i) \ + if (pred[i]) \ + r[i] = !a[i]; \ + } + +#define TEST_ALL(T) \ + T (int16_t, int8_t, 7) \ + T (int32_t, int8_t, 3) \ + T (int32_t, int16_t, 3) \ + T (int64_t, int8_t, 5) \ + T (int64_t, int16_t, 5) \ + T (int64_t, int32_t, 5) + +TEST_ALL (DEF_LOOP) + +/* { dg-final { scan-assembler-times {\tcnot\tz[0-9]+\.b, p[0-7]/m,} 3 } } */ +/* { dg-final { scan-assembler-times {\tcnot\tz[0-9]+\.h, p[0-7]/m,} 2 } } */ +/* { dg-final { scan-assembler-times 
{\tcnot\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_cnot_4.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_cnot_4.c new file mode 100644 index 0000000..729d3f4 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_cnot_4.c @@ -0,0 +1,32 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include <stdint.h> + +#define DEF_LOOP(TYPE1, TYPE2, COUNT) \ + void __attribute__ ((noipa)) \ + test_##TYPE1##_##TYPE2 (TYPE2 *__restrict r, \ + TYPE2 *__restrict a, \ + TYPE1 *__restrict pred) \ + { \ + for (int i = 0; i < COUNT; ++i) \ + r[i] = pred[i] ? !a[i] : a[i]; \ + } + +#define TEST_ALL(T) \ + T (int16_t, int8_t, 7) \ + T (int32_t, int8_t, 3) \ + T (int32_t, int16_t, 3) \ + T (int64_t, int8_t, 5) \ + T (int64_t, int16_t, 5) \ + T (int64_t, int32_t, 5) + +TEST_ALL (DEF_LOOP) + +/* { dg-final { scan-assembler-times {\tcnot\tz[0-9]+\.b, p[0-7]/m,} 3 } } */ +/* { dg-final { scan-assembler-times {\tcnot\tz[0-9]+\.h, p[0-7]/m,} 2 } } */ +/* { dg-final { scan-assembler-times {\tcnot\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ + +/* { dg-final { scan-assembler-not {\tmov\tz} } } */ +/* { dg-final { scan-assembler-not {\tmovprfx\t} } } */ +/* { dg-final { scan-assembler-not {\tsel\t} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_cnot_4_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_cnot_4_run.c new file mode 100644 index 0000000..de9c0a5 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_cnot_4_run.c @@ -0,0 +1,26 @@ +/* { dg-do run { target { aarch64_sve_hw } } } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include "cond_cnot_4.c" + +#define TEST_LOOP(TYPE1, TYPE2, N) \ + { \ + TYPE1 pred[N]; \ + TYPE2 r[N], a[N]; \ + for (int i = 0; i < N; ++i) \ + { \ + a[i] = i & 1 ? 0 : 3 * (i + 1); \ + pred[i] = (i % 3 < 2); \ + asm volatile ("" ::: "memory"); \ + } \ + test_##TYPE1##_##TYPE2 (r, a, pred); \ + for (int i = 0; i < N; ++i) \ + if (r[i] != (TYPE2) (pred[i] ? 
!a[i] : a[i])) \ + __builtin_abort (); \ + } + +int main () +{ + TEST_ALL (TEST_LOOP) + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_cnot_5.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_cnot_5.c new file mode 100644 index 0000000..7318e10 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_cnot_5.c @@ -0,0 +1,32 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include <stdint.h> + +#define DEF_LOOP(TYPE1, TYPE2, COUNT) \ + void __attribute__ ((noipa)) \ + test_##TYPE1##_##TYPE2 (TYPE2 *__restrict r, \ + TYPE1 *__restrict a, \ + TYPE2 *__restrict b) \ + { \ + for (int i = 0; i < COUNT; ++i) \ + r[i] = a[i] == 0 ? !b[i] : a[i]; \ + } + +#define TEST_ALL(T) \ + T (int16_t, int8_t, 7) \ + T (int32_t, int8_t, 3) \ + T (int32_t, int16_t, 3) \ + T (int64_t, int8_t, 5) \ + T (int64_t, int16_t, 5) \ + T (int64_t, int32_t, 5) + +TEST_ALL (DEF_LOOP) + +/* { dg-final { scan-assembler-times {\tcnot\tz[0-9]+\.b, p[0-7]/m,} 3 } } */ +/* { dg-final { scan-assembler-times {\tcnot\tz[0-9]+\.h, p[0-7]/m,} 2 } } */ +/* { dg-final { scan-assembler-times {\tcnot\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ + +/* { dg-final { scan-assembler-not {\tmov\tz} } } */ +/* { dg-final { scan-assembler-not {\tmovprfx\t} } } */ +/* { dg-final { scan-assembler-not {\tsel\t} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_cnot_5_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_cnot_5_run.c new file mode 100644 index 0000000..f8f277c --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_cnot_5_run.c @@ -0,0 +1,26 @@ +/* { dg-do run { target { aarch64_sve_hw } } } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include "cond_cnot_5.c" + +#define TEST_LOOP(TYPE1, TYPE2, N) \ + { \ + TYPE1 a[N]; \ + TYPE2 r[N], b[N]; \ + for (int i = 0; i < N; ++i) \ + { \ + a[i] = i % 3 < 2 ? 0 : i * 42; \ + b[i] = i & 1 ? 
0 : 3 * (i + 1); \ + asm volatile ("" ::: "memory"); \ + } \ + test_##TYPE1##_##TYPE2 (r, a, b); \ + for (int i = 0; i < N; ++i) \ + if (r[i] != (TYPE2) (a[i] == 0 ? !b[i] : a[i])) \ + __builtin_abort (); \ + } + +int main () +{ + TEST_ALL (TEST_LOOP) + return 0; +} diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_cnot_6.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_cnot_6.c new file mode 100644 index 0000000..d44e357 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_cnot_6.c @@ -0,0 +1,31 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include <stdint.h> + +#define DEF_LOOP(TYPE1, TYPE2, COUNT) \ + void __attribute__ ((noipa)) \ + test_##TYPE1##_##TYPE2 (TYPE2 *__restrict r, \ + TYPE1 *__restrict a, \ + TYPE2 *__restrict b) \ + { \ + for (int i = 0; i < COUNT; ++i) \ + r[i] = a[i] == 0 ? !b[i] : 127; \ + } + +#define TEST_ALL(T) \ + T (int16_t, int8_t, 7) \ + T (int32_t, int8_t, 3) \ + T (int32_t, int16_t, 3) \ + T (int64_t, int8_t, 5) \ + T (int64_t, int16_t, 5) \ + T (int64_t, int32_t, 5) + +TEST_ALL (DEF_LOOP) + +/* { dg-final { scan-assembler-times {\tcnot\tz[0-9]+\.b, p[0-7]/m,} 3 } } */ +/* { dg-final { scan-assembler-times {\tcnot\tz[0-9]+\.h, p[0-7]/m,} 2 } } */ +/* { dg-final { scan-assembler-times {\tcnot\tz[0-9]+\.s, p[0-7]/m,} 1 } } */ + +/* { dg-final { scan-assembler-not {\tmov\tz[^\n]*z} } } */ +/* { dg-final { scan-assembler-not {\tsel\t} } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/sve/cond_cnot_6_run.c b/gcc/testsuite/gcc.target/aarch64/sve/cond_cnot_6_run.c new file mode 100644 index 0000000..9e33616 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/sve/cond_cnot_6_run.c @@ -0,0 +1,26 @@ +/* { dg-do run { target { aarch64_sve_hw } } } */ +/* { dg-options "-O2 -ftree-vectorize" } */ + +#include "cond_cnot_6.c" + +#define TEST_LOOP(TYPE1, TYPE2, N) \ + { \ + TYPE1 a[N]; \ + TYPE2 r[N], b[N]; \ + for (int i = 0; i < N; ++i) \ + { \ + a[i] = i % 3 < 2 ? 0 : i * 42; \ + b[i] = i & 1 ? 
0 : 3 * (i + 1); \ + asm volatile ("" ::: "memory"); \ + } \ + test_##TYPE1##_##TYPE2 (r, a, b); \ + for (int i = 0; i < N; ++i) \ + if (r[i] != (TYPE2) (a[i] == 0 ? !b[i] : 127)) \ + __builtin_abort (); \ + } + +int main () +{ + TEST_ALL (TEST_LOOP) + return 0; +} -- 2.7.4