From fd6605367d00762e6f63dd6fc85b504ee0c1667a Mon Sep 17 00:00:00 2001
From: =?utf8?q?Timur=20Krist=C3=B3f?= <timur.kristof@gmail.com>
Date: Fri, 28 May 2021 21:56:13 +0200
Subject: [PATCH] aco: Implement nir_op_sad_u8x4.
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

Fix up the operand size for v_sad instructions, and implement
the new NIR horizontal add. There is no viable way to do this
in SALU, so let's always use a VGPR destination.

Signed-off-by: Timur Kristóf <timur.kristof@gmail.com>
Reviewed-by: Tony Wasserka <tony.wasserka@gmx.de>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/11072>
---
 src/amd/compiler/aco_instruction_selection.cpp       | 5 +++++
 src/amd/compiler/aco_instruction_selection_setup.cpp | 1 +
 src/amd/compiler/aco_opcodes.py                      | 8 ++++++--
 3 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp
index b985b5a..40a1687 100644
--- a/src/amd/compiler/aco_instruction_selection.cpp
+++ b/src/amd/compiler/aco_instruction_selection.cpp
@@ -3031,6 +3031,11 @@ void visit_alu_instr(isel_context *ctx, nir_alu_instr *instr)
       }
       break;
    }
+   case nir_op_sad_u8x4: {
+      assert(dst.regClass() == v1);
+      emit_vop3a_instruction(ctx, instr, aco_opcode::v_sad_u8, dst, false, 3u, false);
+      break;
+   }
    case nir_op_fquantize2f16: {
       Temp src = get_alu_src(ctx, instr->src[0]);
       Temp f16 = bld.vop1(aco_opcode::v_cvt_f16_f32, bld.def(v1), src);
diff --git a/src/amd/compiler/aco_instruction_selection_setup.cpp b/src/amd/compiler/aco_instruction_selection_setup.cpp
index 15f9ce3..f7cebe0 100644
--- a/src/amd/compiler/aco_instruction_selection_setup.cpp
+++ b/src/amd/compiler/aco_instruction_selection_setup.cpp
@@ -663,6 +663,7 @@ void init_context(isel_context *ctx, nir_shader *shader)
                   case nir_op_frexp_exp:
                   case nir_op_cube_face_index:
                   case nir_op_cube_face_coord:
+                  case nir_op_sad_u8x4:
                      type = RegType::vgpr;
                      break;
                   case nir_op_f2i16:
diff --git a/src/amd/compiler/aco_opcodes.py b/src/amd/compiler/aco_opcodes.py
index 7a1099d..5267fb1 100644
--- a/src/amd/compiler/aco_opcodes.py
+++ b/src/amd/compiler/aco_opcodes.py
@@ -238,8 +238,10 @@ class Opcode(object):
       self.definition_size = def_dtype_sizes.get(def_dtype, self.operand_size)
 
       # exceptions for operands:
-      if 'sad_' in name:
+      if 'qsad_' in name:
         self.operand_size = 0
+      elif 'sad_' in name:
+        self.operand_size = 32
       elif name in ['v_mad_u64_u32', 'v_mad_i64_i32']:
         self.operand_size = 0
       elif self.operand_size == 24:
@@ -251,8 +253,10 @@ class Opcode(object):
         self.operand_size = 32
 
       # exceptions for definitions:
-      if 'sad_' in name:
+      if 'qsad_' in name:
         self.definition_size = 0
+      elif 'sad_' in name:
+        self.definition_size = 32
       elif '_pk' in name:
         self.definition_size = 32
 
-- 
2.7.4