From 1275981df8c3762c208a49c8cde7deaa15489b24 Mon Sep 17 00:00:00 2001
From: =?utf8?q?Daniel=20Sch=C3=BCrmann?= <daniel@schuermann.dev>
Date: Mon, 4 Sep 2023 15:32:24 +0200
Subject: [PATCH] aco: don't optimize cross-lane instructions across p_wqm

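Bump the value-numbering exec_id when a p_wqm is encountered, so that
cross-lane instructions before and after it are no longer merged.
We will use p_wqm as a marker in the next step.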

Totals from 8846 (11.55% of 76572) affected shaders: (GFX11)

Instrs: 7031274 -> 7072729 (+0.59%); split: -0.02%, +0.60%
CodeSize: 37060272 -> 37355244 (+0.80%); split: -0.01%, +0.80%
VGPRs: 402660 -> 398724 (-0.98%); split: -0.99%, +0.01%
Latency: 62231926 -> 62322311 (+0.15%); split: -0.01%, +0.15%
InvThroughput: 10341361 -> 10392589 (+0.50%); split: -0.00%, +0.50%
VClause: 105344 -> 105368 (+0.02%); split: -0.03%, +0.05%
SClause: 218330 -> 218469 (+0.06%); split: -0.07%, +0.14%
Copies: 378609 -> 377644 (-0.25%); split: -0.42%, +0.17%
Branches: 97218 -> 97207 (-0.01%); split: -0.01%, +0.00%
PreSGPRs: 307654 -> 307644 (-0.00%); split: -0.08%, +0.08%
PreVGPRs: 314744 -> 308650 (-1.94%)

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/25038>
---
 src/amd/compiler/aco_opt_value_numbering.cpp |  2 +-
 src/amd/compiler/tests/test_d3d11_derivs.cpp | 10 ++++++----
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/src/amd/compiler/aco_opt_value_numbering.cpp b/src/amd/compiler/aco_opt_value_numbering.cpp
index f7619d3..d73adc4 100644
--- a/src/amd/compiler/aco_opt_value_numbering.cpp
+++ b/src/amd/compiler/aco_opt_value_numbering.cpp
@@ -382,7 +382,7 @@ process_block(vn_ctx& ctx, Block& block)
       }
 
       if (instr->opcode == aco_opcode::p_discard_if ||
-          instr->opcode == aco_opcode::p_demote_to_helper)
+          instr->opcode == aco_opcode::p_demote_to_helper || instr->opcode == aco_opcode::p_wqm)
          ctx.exec_id++;
 
       if (!can_eliminate(instr)) {
diff --git a/src/amd/compiler/tests/test_d3d11_derivs.cpp b/src/amd/compiler/tests/test_d3d11_derivs.cpp
index ee0299e..c0742e0 100644
--- a/src/amd/compiler/tests/test_d3d11_derivs.cpp
+++ b/src/amd/compiler/tests/test_d3d11_derivs.cpp
@@ -599,11 +599,13 @@ BEGIN_TEST(d3d11_derivs.get_lod)
    //>> v2: %vec = p_create_vector %x, %y
    //>> lv2: %wqm = p_start_linear_vgpr (kill)%vec
    //>> v1: %x0 = v_mov_b32 %x quad_perm:[0,0,0,0] bound_ctrl:1
-   //>> v1: %x1_m_x0 = v_sub_f32 %x, %x0 quad_perm:[1,1,1,1] bound_ctrl:1
-   //>> v1: %x2_m_x0 = v_sub_f32 (kill)%x, (kill)%x0 quad_perm:[2,2,2,2] bound_ctrl:1
+   //>> v1: %x1_m_x0 = v_sub_f32 %x, (kill)%x0 quad_perm:[1,1,1,1] bound_ctrl:1
+   //>> v1: %x1 = v_mov_b32 %x quad_perm:[0,0,0,0] bound_ctrl:1
+   //>> v1: %x2_m_x0 = v_sub_f32 (kill)%x, (kill)%x1 quad_perm:[2,2,2,2] bound_ctrl:1
    //>> v1: %y0 = v_mov_b32 %y quad_perm:[0,0,0,0] bound_ctrl:1
-   //>> v1: %y1_m_y0 = v_sub_f32 %y, %y0 quad_perm:[1,1,1,1] bound_ctrl:1
-   //>> v1: %y2_m_y0 = v_sub_f32 (kill)%y, (kill)%y0 quad_perm:[2,2,2,2] bound_ctrl:1
+   //>> v1: %y1_m_y0 = v_sub_f32 %y, (kill)%y0 quad_perm:[1,1,1,1] bound_ctrl:1
+   //>> v1: %y1 = v_mov_b32 %y quad_perm:[0,0,0,0] bound_ctrl:1
+   //>> v1: %y2_m_y0 = v_sub_f32 (kill)%y, (kill)%y1 quad_perm:[2,2,2,2] bound_ctrl:1
    //>> BB1
    //>> v2: %_ = image_get_lod (kill)%_, (kill)%_, v1: undef, %wqm 2d
    //>> BB2
-- 
2.7.4