From 31a0574b968b8dbb2b024fb332bcba87a02bef46 Mon Sep 17 00:00:00 2001
From: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Date: Wed, 30 Sep 2020 10:48:29 +0200
Subject: [PATCH] ac/nir: implement nir_op_fsat

Use fmed3 if available, otherwise fall back to fmin/fmax.

Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/6932>
---
 src/amd/llvm/ac_llvm_build.c  | 44 +++++++++++++++++++++++++++++++++++++++++++
 src/amd/llvm/ac_llvm_build.h  |  3 +++
 src/amd/llvm/ac_nir_to_llvm.c |  5 +++++
 3 files changed, 52 insertions(+)

diff --git a/src/amd/llvm/ac_llvm_build.c b/src/amd/llvm/ac_llvm_build.c
index e37c3da..44ebb01 100644
--- a/src/amd/llvm/ac_llvm_build.c
+++ b/src/amd/llvm/ac_llvm_build.c
@@ -2447,6 +2447,50 @@ void ac_build_waitcnt(struct ac_llvm_context *ctx, unsigned wait_flags)
    ac_build_intrinsic(ctx, "llvm.amdgcn.s.waitcnt", ctx->voidt, args, 1, 0);
 }
 
+/* Clamp src to the [0.0, 1.0] range. "type" is the float type of src;
+ * its element size picks the fmed3 intrinsic or the fmin/fmax fallback.
+ */
+LLVMValueRef ac_build_fsat(struct ac_llvm_context *ctx, LLVMValueRef src,
+                           LLVMTypeRef type)
+{
+   unsigned bitsize = ac_get_elem_bits(ctx, type);
+   LLVMValueRef zero = LLVMConstReal(type, 0.0);
+   LLVMValueRef one = LLVMConstReal(type, 1.0);
+   LLVMValueRef result;
+
+   if (bitsize == 64 || (bitsize == 16 && ctx->chip_class <= GFX8)) {
+      /* Use fmin/fmax for 64-bit fsat or 16-bit on GFX6-GFX8 because LLVM
+       * doesn't expose an intrinsic.
+       */
+      result = ac_build_fmin(ctx, ac_build_fmax(ctx, src, zero), one);
+   } else {
+      LLVMTypeRef ret_type;
+      const char *intr;
+
+      if (bitsize == 16) {
+         intr = "llvm.amdgcn.fmed3.f16";
+         ret_type = ctx->f16;
+      } else {
+         assert(bitsize == 32);
+         intr = "llvm.amdgcn.fmed3.f32";
+         ret_type = ctx->f32;
+      }
+
+      /* med3(0, 1, src) is the median of the three, i.e. src clamped. */
+      LLVMValueRef params[] = {zero, one, src};
+
+      result = ac_build_intrinsic(ctx, intr, ret_type, params, 3,
+                                  AC_FUNC_ATTR_READNONE);
+   }
+
+   if (ctx->chip_class < GFX9 && bitsize == 32) {
+      /* Only pre-GFX9 chips do not flush denorms. */
+      result = ac_build_canonicalize(ctx, result, bitsize);
+   }
+
+   return result;
+}
+
 LLVMValueRef ac_build_fract(struct ac_llvm_context *ctx, LLVMValueRef src0, unsigned bitsize)
 {
    LLVMTypeRef type;
diff --git a/src/amd/llvm/ac_llvm_build.h b/src/amd/llvm/ac_llvm_build.h
index 2e08a99..8423c87 100644
--- a/src/amd/llvm/ac_llvm_build.h
+++ b/src/amd/llvm/ac_llvm_build.h
@@ -455,6 +455,9 @@ LLVMValueRef ac_build_isign(struct ac_llvm_context *ctx, LLVMValueRef src0);
 LLVMValueRef ac_build_fsign(struct ac_llvm_context *ctx, LLVMValueRef src);
 LLVMValueRef ac_build_bit_count(struct ac_llvm_context *ctx, LLVMValueRef src0);
 
+/* Saturate src to [0.0, 1.0]; type is the float type of src. */
+LLVMValueRef ac_build_fsat(struct ac_llvm_context *ctx, LLVMValueRef src, LLVMTypeRef type);
+
 LLVMValueRef ac_build_bitfield_reverse(struct ac_llvm_context *ctx, LLVMValueRef src0);
 
 void ac_optimize_vs_outputs(struct ac_llvm_context *ac, LLVMValueRef main_fn,
diff --git a/src/amd/llvm/ac_nir_to_llvm.c b/src/amd/llvm/ac_nir_to_llvm.c
index f07960e..89e4938 100644
--- a/src/amd/llvm/ac_nir_to_llvm.c
+++ b/src/amd/llvm/ac_nir_to_llvm.c
@@ -743,6 +743,11 @@ static void visit_alu(struct ac_nir_context *ctx, const nir_alu_instr *instr)
          result = ac_build_canonicalize(&ctx->ac, result, instr->dest.dest.ssa.bit_size);
       }
       break;
+   case nir_op_fsat:
+      src[0] = ac_to_float(&ctx->ac, src[0]);
+      result = ac_build_fsat(&ctx->ac, src[0],
+                             ac_to_float_type(&ctx->ac, def_type));
+      break;
    case nir_op_iabs:
       result = emit_iabs(&ctx->ac, src[0]);
       break;
-- 
2.7.4