From c590a3eadfd67c4694dd4a7013ce957a51289a56 Mon Sep 17 00:00:00 2001 From: Kenneth Graunke Date: Tue, 7 Mar 2023 20:41:55 -0800 Subject: [PATCH] intel/fs: Move packHalf2x16 handling to lower_pack() This mainly lets the software scoreboarding pass correctly mark the instructions, without needing to resort to fragile manual handling in the generator. We can also make small improvements. On Gfx 8LP-12.0, we no longer have the restrictions about DWord alignment, so we can simply write each half into its intended location, rather than writing it to the low DWord and then shifting it in place. Reviewed-by: Lionel Landwerlin Reviewed-by: Sagar Ghuge Part-of: --- src/intel/compiler/brw_fs.h | 5 --- src/intel/compiler/brw_fs_generator.cpp | 53 -------------------------------- src/intel/compiler/brw_fs_lower_pack.cpp | 36 ++++++++++++++++++++-- 3 files changed, 33 insertions(+), 61 deletions(-) diff --git a/src/intel/compiler/brw_fs.h b/src/intel/compiler/brw_fs.h index 805c60a..8fb3c3b 100644 --- a/src/intel/compiler/brw_fs.h +++ b/src/intel/compiler/brw_fs.h @@ -637,11 +637,6 @@ private: void generate_halt(fs_inst *inst); - void generate_pack_half_2x16_split(fs_inst *inst, - struct brw_reg dst, - struct brw_reg x, - struct brw_reg y); - void generate_mov_indirect(fs_inst *inst, struct brw_reg dst, struct brw_reg reg, diff --git a/src/intel/compiler/brw_fs_generator.cpp b/src/intel/compiler/brw_fs_generator.cpp index 7a26bff..a99f98b 100644 --- a/src/intel/compiler/brw_fs_generator.cpp +++ b/src/intel/compiler/brw_fs_generator.cpp @@ -1643,55 +1643,6 @@ fs_generator::generate_set_sample_id(fs_inst *inst, } void -fs_generator::generate_pack_half_2x16_split(fs_inst *, - struct brw_reg dst, - struct brw_reg x, - struct brw_reg y) -{ - assert(devinfo->ver >= 7); - assert(dst.type == BRW_REGISTER_TYPE_UD); - assert(x.type == BRW_REGISTER_TYPE_F); - assert(y.type == BRW_REGISTER_TYPE_F); - - /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16: - * - * Because this instruction does not have a 16-bit floating-point type, - * the destination data type must be Word (W). - * - * The destination must be DWord-aligned and specify a horizontal stride - * (HorzStride) of 2. The 16-bit result is stored in the lower word of - * each destination channel and the upper word is not modified. - */ - const enum brw_reg_type t = devinfo->ver > 7 - ? BRW_REGISTER_TYPE_HF : BRW_REGISTER_TYPE_W; - struct brw_reg dst_w = spread(retype(dst, t), 2); - - if (y.file == IMM) { - const uint32_t hhhh0000 = _mesa_float_to_half(y.f) << 16; - - brw_MOV(p, dst, brw_imm_ud(hhhh0000)); - brw_set_default_swsb(p, tgl_swsb_regdist(1)); - } else { - /* Give each 32-bit channel of dst the form below, where "." means - * unchanged. - * 0x....hhhh - */ - brw_F32TO16(p, dst_w, y); - - /* Now the form: - * 0xhhhh0000 - */ - brw_set_default_swsb(p, tgl_swsb_regdist(1)); - brw_SHL(p, dst, dst, brw_imm_ud(16u)); - } - - /* And, finally the form of packHalf2x16's output: - * 0xhhhhllll - */ - brw_F32TO16(p, dst_w, x); -} - -void fs_generator::enable_debug(const char *shader_name) { debug_flag = true; @@ -2350,10 +2301,6 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width, generate_set_sample_id(inst, dst, src[0], src[1]); break; - case FS_OPCODE_PACK_HALF_2x16_SPLIT: - generate_pack_half_2x16_split(inst, dst, src[0], src[1]); - break; - case SHADER_OPCODE_HALT_TARGET: /* This is the place where the final HALT needs to be inserted if * we've emitted any discards. If not, this will emit no code. diff --git a/src/intel/compiler/brw_fs_lower_pack.cpp b/src/intel/compiler/brw_fs_lower_pack.cpp index 0b0f941..3a60989 100644 --- a/src/intel/compiler/brw_fs_lower_pack.cpp +++ b/src/intel/compiler/brw_fs_lower_pack.cpp @@ -21,6 +21,7 @@ * IN THE SOFTWARE. */ +#include "util/half_float.h" #include "brw_fs.h" #include "brw_cfg.h" #include "brw_fs_builder.h" @@ -33,7 +34,8 @@ fs_visitor::lower_pack() bool progress = false; foreach_block_and_inst_safe(block, fs_inst, inst, cfg) { - if (inst->opcode != FS_OPCODE_PACK) + if (inst->opcode != FS_OPCODE_PACK && + inst->opcode != FS_OPCODE_PACK_HALF_2x16_SPLIT) continue; assert(inst->dst.file == VGRF); @@ -48,8 +50,36 @@ fs_visitor::lower_pack() */ if (!inst->is_partial_write()) ibld.emit_undef_for_dst(inst); - for (unsigned i = 0; i < inst->sources; i++) - ibld.MOV(subscript(dst, inst->src[i].type, i), inst->src[i]); + + switch (inst->opcode) { + case FS_OPCODE_PACK: + for (unsigned i = 0; i < inst->sources; i++) + ibld.MOV(subscript(dst, inst->src[i].type, i), inst->src[i]); + break; + case FS_OPCODE_PACK_HALF_2x16_SPLIT: + assert(dst.type == BRW_REGISTER_TYPE_UD); + + for (unsigned i = 0; i < inst->sources; i++) { + if (inst->src[i].file == IMM) { + const uint32_t half = _mesa_float_to_half(inst->src[i].f); + ibld.MOV(subscript(dst, BRW_REGISTER_TYPE_UW, i), + brw_imm_uw(half)); + } else if (i == 1 && devinfo->ver < 9) { + /* Pre-Skylake requires DWord aligned destinations */ + fs_reg tmp = ibld.vgrf(BRW_REGISTER_TYPE_UD); + ibld.F32TO16(subscript(tmp, BRW_REGISTER_TYPE_HF, 0), + inst->src[i]); + ibld.MOV(subscript(dst, BRW_REGISTER_TYPE_UW, 1), + subscript(tmp, BRW_REGISTER_TYPE_UW, 0)); + } else { + ibld.F32TO16(subscript(dst, BRW_REGISTER_TYPE_HF, i), + inst->src[i]); + } + } + break; + default: + unreachable("skipped above"); + } inst->remove(block); progress = true; -- 2.7.4