From d459de85b75842135372191af4d9dab2d75c65b3 Mon Sep 17 00:00:00 2001
From: Alyssa Rosenzweig
Date: Mon, 31 Jul 2023 17:19:56 -0400
Subject: [PATCH] agx: Optimize swaps of 2x16 channels

We can use extr to swap the low and high halves of a 32-bit register in one
instruction. No shader-db changes, but it reduces xor's on a deqp I'm looking
at.

Yes, I'm procrastinating on debugging deqps, how'd you guess?

Signed-off-by: Alyssa Rosenzweig
Part-of:
---
 src/asahi/compiler/agx_lower_parallel_copy.c         | 17 +++++++++++++++++
 src/asahi/compiler/test/test-lower-parallel-copy.cpp | 14 +++++++++-----
 2 files changed, 26 insertions(+), 5 deletions(-)

diff --git a/src/asahi/compiler/agx_lower_parallel_copy.c b/src/asahi/compiler/agx_lower_parallel_copy.c
index 586b67e..8db6f8b 100644
--- a/src/asahi/compiler/agx_lower_parallel_copy.c
+++ b/src/asahi/compiler/agx_lower_parallel_copy.c
@@ -36,6 +36,23 @@ do_swap(agx_builder *b, const struct agx_copy *copy)
    if (copy->dest == copy->src.value)
       return;
 
+   /* We can swap lo/hi halves of a 32-bit register with a 32-bit extr */
+   if (copy->src.size == AGX_SIZE_16 &&
+       (copy->dest >> 1) == (copy->src.value >> 1)) {
+
+      assert(((copy->dest & 1) == (1 - (copy->src.value & 1))) &&
+             "no trivial swaps, and only 2 halves of a register");
+
+      /* r0 = extr r0, r0, #16
+       *    = (((r0 << 32) | r0) >> 16) & 0xFFFFFFFF
+       *    = (((r0 << 32) >> 16) & 0xFFFFFFFF) | (r0 >> 16)
+       *    = (r0l << 16) | r0h
+       */
+      agx_index reg32 = agx_register(copy->dest & ~1, AGX_SIZE_32);
+      agx_extr_to(b, reg32, reg32, reg32, agx_immediate(16), 0);
+      return;
+   }
+
    agx_index x = agx_register(copy->dest, copy->src.size);
    agx_index y = copy->src;
 
diff --git a/src/asahi/compiler/test/test-lower-parallel-copy.cpp b/src/asahi/compiler/test/test-lower-parallel-copy.cpp
index ba2c071..0dbb347 100644
--- a/src/asahi/compiler/test/test-lower-parallel-copy.cpp
+++ b/src/asahi/compiler/test/test-lower-parallel-copy.cpp
@@ -23,6 +23,13 @@
    } while (0)
 
 static inline void
+extr_swap(agx_builder *b, agx_index x)
+{
+   x.size = AGX_SIZE_32;
+   agx_extr_to(b, x, x, x, agx_immediate(16), 0);
+}
+
+static inline void
 xor_swap(agx_builder *b, agx_index x, agx_index y)
 {
    agx_xor_to(b, x, x, y);
@@ -161,9 +168,7 @@ TEST_F(LowerParallelCopy, Swap)
       {.dest = 1, .src = agx_register(0, AGX_SIZE_16)},
    };
 
-   CASE(test_2, {
-      xor_swap(b, agx_register(0, AGX_SIZE_16), agx_register(1, AGX_SIZE_16));
-   });
+   CASE(test_2, { extr_swap(b, agx_register(0, AGX_SIZE_16)); });
 }
 
 TEST_F(LowerParallelCopy, Cycle3)
@@ -174,9 +179,8 @@ TEST_F(LowerParallelCopy, Cycle3)
       {.dest = 2, .src = agx_register(0, AGX_SIZE_16)},
    };
 
-   /* XXX: requires 6 instructions. if we had a temp free, could do it in 4 */
    CASE(test, {
-      xor_swap(b, agx_register(0, AGX_SIZE_16), agx_register(1, AGX_SIZE_16));
+      extr_swap(b, agx_register(0, AGX_SIZE_16));
       xor_swap(b, agx_register(1, AGX_SIZE_16), agx_register(2, AGX_SIZE_16));
    });
 }
-- 
2.7.4