is_f32_to_f16_conversion(nir_instr *instr)
{
return is_n_to_m_conversion(instr, 32, nir_op_f2f16) ||
- is_n_to_m_conversion(instr, 32, nir_op_f2f16_rtne) ||
is_n_to_m_conversion(instr, 32, nir_op_f2fmp);
}
is_i32_to_i16_conversion(nir_instr *instr)
{
return is_n_to_m_conversion(instr, 32, nir_op_i2i16) ||
- is_n_to_m_conversion(instr, 32, nir_op_u2u16);
-}
-
-static void
-replace_with_mov(nir_builder *b, nir_instr *instr, nir_src *src,
- nir_alu_instr *alu)
-{
- nir_ssa_def *mov = nir_mov_alu(b, alu->src[0],
- nir_dest_num_components(alu->dest.dest));
- assert(!alu->dest.saturate);
- nir_instr_rewrite_src_ssa(instr, src, mov);
-}
-
-/**
- * If texture source operands use f16->f32 conversions or return values are
- * followed by f16->f32 or f32->f16, remove those conversions. This benefits
- * drivers that have texture opcodes that can accept and return 16-bit types.
- *
- * "tex_src_types" is a mask of nir_tex_src_* operands that should be handled.
- * It's always done for the destination.
- *
- * This should be run after late algebraic optimizations.
- * Copy propagation and DCE should be run after this.
- */
-bool
-nir_fold_16bit_sampler_conversions(nir_shader *nir,
- unsigned tex_src_types,
- uint32_t sampler_dims)
-{
- bool changed = false;
- nir_function_impl *impl = nir_shader_get_entrypoint(nir);
- assert(impl);
-
- nir_builder b;
- nir_builder_init(&b, impl);
-
- nir_foreach_block_safe (block, impl) {
- nir_foreach_instr_safe (instr, block) {
- if (instr->type != nir_instr_type_tex)
- continue;
-
- nir_tex_instr *tex = nir_instr_as_tex(instr);
- nir_instr *src;
- nir_alu_instr *src_alu;
-
- /* Skip sparse residency */
- if (tex->is_sparse)
- continue;
-
- if ((tex->op == nir_texop_txs ||
- tex->op == nir_texop_query_levels) ||
- !(sampler_dims & BITFIELD_BIT(tex->sampler_dim)))
- continue;
-
- /* Optimize source operands. */
- for (unsigned i = 0; i < tex->num_srcs; i++) {
- /* Filter out sources that should be ignored. */
- if (!(BITFIELD_BIT(tex->src[i].src_type) & tex_src_types))
- continue;
-
- src = tex->src[i].src.ssa->parent_instr;
- if (src->type != nir_instr_type_alu)
- continue;
-
- src_alu = nir_instr_as_alu(src);
- b.cursor = nir_before_instr(src);
-
- nir_alu_type src_type = nir_tex_instr_src_type(tex, i);
-
- /* Handle vector sources that are made of scalar instructions. */
- if (nir_op_is_vec(src_alu->op)) {
- /* See if the vector is made of f16->f32 opcodes. */
- unsigned num = nir_dest_num_components(src_alu->dest.dest);
- bool is_f16_to_f32 = src_type == nir_type_float;
- bool is_u16_to_u32 = src_type & (nir_type_int | nir_type_uint);
-
- for (unsigned comp = 0; comp < num; comp++) {
- nir_instr *instr = src_alu->src[comp].src.ssa->parent_instr;
- is_f16_to_f32 &= is_f16_to_f32_conversion(instr);
- /* Zero-extension (u16) and sign-extension (i16) have
- * the same behavior here - txf returns 0 if bit 15 is set
- * because it's out of bounds and the higher bits don't
- * matter.
- */
- is_u16_to_u32 &= is_u16_to_u32_conversion(instr) ||
- is_i16_to_i32_conversion(instr);
- }
-
- if (!is_f16_to_f32 && !is_u16_to_u32)
- continue;
-
- nir_alu_instr *new_vec = nir_alu_instr_clone(nir, src_alu);
- nir_instr_insert_after(&src_alu->instr, &new_vec->instr);
-
- /* Replace conversions with mov. */
- for (unsigned comp = 0; comp < num; comp++) {
- nir_instr *instr = new_vec->src[comp].src.ssa->parent_instr;
- replace_with_mov(&b, &new_vec->instr,
- &new_vec->src[comp].src,
- nir_instr_as_alu(instr));
- }
-
- new_vec->dest.dest.ssa.bit_size =
- new_vec->src[0].src.ssa->bit_size;
- nir_instr_rewrite_src_ssa(&tex->instr, &tex->src[i].src,
- &new_vec->dest.dest.ssa);
- changed = true;
- } else if ((is_f16_to_f32_conversion(&src_alu->instr) &&
- src_type == nir_type_float) ||
- ((is_u16_to_u32_conversion(&src_alu->instr) ||
- is_i16_to_i32_conversion(&src_alu->instr)) &&
- src_type & (nir_type_int | nir_type_uint))) {
- /* Handle scalar sources. */
- replace_with_mov(&b, &tex->instr, &tex->src[i].src, src_alu);
- changed = true;
- }
- }
-
- /* Optimize the destination. */
- bool is_f32_to_f16 = tex->dest_type & nir_type_float;
- /* same behavior for int and uint */
- bool is_i32_to_i16 = tex->dest_type & (nir_type_int | nir_type_uint);
-
- nir_foreach_use(use, &tex->dest.ssa) {
- is_f32_to_f16 &= is_f32_to_f16_conversion(use->parent_instr);
- is_i32_to_i16 &= is_i32_to_i16_conversion(use->parent_instr);
- }
-
- if (is_f32_to_f16 || is_i32_to_i16) {
- /* All uses are the same conversions. Replace them with mov. */
- nir_foreach_use(use, &tex->dest.ssa) {
- nir_alu_instr *conv = nir_instr_as_alu(use->parent_instr);
- conv->op = nir_op_mov;
- tex->dest.ssa.bit_size = conv->dest.dest.ssa.bit_size;
- tex->dest_type = (tex->dest_type & (~16 & ~32 & ~64)) |
- conv->dest.dest.ssa.bit_size;
- }
- changed = true;
- }
- }
- }
-
- if (changed) {
- nir_metadata_preserve(impl, nir_metadata_dominance |
- nir_metadata_block_index);
- } else {
- nir_metadata_preserve(impl, nir_metadata_all);
- }
-
- return changed;
+ is_n_to_m_conversion(instr, 32, nir_op_u2u16) ||
+ is_n_to_m_conversion(instr, 32, nir_op_i2imp);
}
/**
}
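+/* Returns true when every component of a 32-bit source is an undef, a
+ * constant representable in 16 bits, or a 16->32 conversion, so the source
+ * can be narrowed to 16 bits. When sext_matters is false, sign-extended
+ * (i2i32) and zero-extended (u2u32) integer components are accepted
+ * interchangeably.
+ */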
static bool
-fold_16bit_store_data(nir_builder *b, nir_intrinsic_instr *instr)
+can_fold_16bit_src(nir_ssa_def *ssa, nir_alu_type src_type, bool sext_matters)
{
- nir_alu_type src_type = nir_intrinsic_src_type(instr);
- nir_src *data_src = &instr->src[3];
-
- b->cursor = nir_before_instr(&instr->instr);
-
bool fold_f16 = src_type == nir_type_float32;
- bool fold_u16 = src_type == nir_type_uint32;
- bool fold_i16 = src_type == nir_type_int32;
-
- nir_ssa_scalar comps[NIR_MAX_VEC_COMPONENTS];
- for (unsigned i = 0; i < instr->num_components; i++) {
- comps[i] = nir_ssa_scalar_resolved(data_src->ssa, i);
- if (comps[i].def->parent_instr->type == nir_instr_type_ssa_undef)
+ bool fold_u16 = src_type == nir_type_uint32 && sext_matters;
+ bool fold_i16 = src_type == nir_type_int32 && sext_matters;
+ bool fold_i16_u16 = (src_type == nir_type_uint32 || src_type == nir_type_int32) && !sext_matters;
+
+ bool can_fold = fold_f16 || fold_u16 || fold_i16 || fold_i16_u16;
+ for (unsigned i = 0; can_fold && i < ssa->num_components; i++) {
+ nir_ssa_scalar comp = nir_ssa_scalar_resolved(ssa, i);
+ if (comp.def->parent_instr->type == nir_instr_type_ssa_undef)
continue;
- else if (nir_ssa_scalar_is_const(comps[i])) {
- fold_f16 &= const_is_f16(comps[i]);
- fold_u16 &= const_is_u16(comps[i]);
- fold_i16 &= const_is_i16(comps[i]);
+ else if (nir_ssa_scalar_is_const(comp)) {
+ if (fold_f16)
+ can_fold &= const_is_f16(comp);
+ else if (fold_u16)
+ can_fold &= const_is_u16(comp);
+ else if (fold_i16)
+ can_fold &= const_is_i16(comp);
+ else if (fold_i16_u16)
+ can_fold &= (const_is_u16(comp) || const_is_i16(comp));
} else {
- fold_f16 &= is_f16_to_f32_conversion(comps[i].def->parent_instr);
- fold_u16 &= is_u16_to_u32_conversion(comps[i].def->parent_instr);
- fold_i16 &= is_i16_to_i32_conversion(comps[i].def->parent_instr);
+ if (fold_f16)
+ can_fold &= is_f16_to_f32_conversion(comp.def->parent_instr);
+ else if (fold_u16)
+ can_fold &= is_u16_to_u32_conversion(comp.def->parent_instr);
+ else if (fold_i16)
+ can_fold &= is_i16_to_i32_conversion(comp.def->parent_instr);
+ else if (fold_i16_u16)
+ can_fold &= (is_i16_to_i32_conversion(comp.def->parent_instr) ||
+ is_u16_to_u32_conversion(comp.def->parent_instr));
}
}
- if (!fold_f16 && !fold_u16 && !fold_i16)
- return false;
+ return can_fold;
+}
+
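+/* Rewrite a 32-bit source as a 16-bit vector: 16->32 conversions are
+ * bypassed, constants are re-emitted at 16 bits and undefs are re-created
+ * at 16 bits. Callers must check can_fold_16bit_src() first.
+ */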
+static void
+fold_16bit_src(nir_builder *b, nir_instr *instr, nir_src *src, nir_alu_type src_type)
+{
+ b->cursor = nir_before_instr(instr);
nir_ssa_scalar new_comps[NIR_MAX_VEC_COMPONENTS];
- for (unsigned i = 0; i < instr->num_components; i++) {
- if (comps[i].def->parent_instr->type == nir_instr_type_ssa_undef)
+ for (unsigned i = 0; i < src->ssa->num_components; i++) {
+ nir_ssa_scalar comp = nir_ssa_scalar_resolved(src->ssa, i);
+
+ if (comp.def->parent_instr->type == nir_instr_type_ssa_undef)
new_comps[i] = nir_get_ssa_scalar(nir_ssa_undef(b, 1, 16), 0);
- else if (nir_ssa_scalar_is_const(comps[i])) {
+ else if (nir_ssa_scalar_is_const(comp)) {
nir_ssa_def *constant;
if (src_type == nir_type_float32)
- constant = nir_imm_float16(b, nir_ssa_scalar_as_float(comps[i]));
+ constant = nir_imm_float16(b, nir_ssa_scalar_as_float(comp));
else
- constant = nir_imm_intN_t(b, nir_ssa_scalar_as_uint(comps[i]), 16);
+ constant = nir_imm_intN_t(b, nir_ssa_scalar_as_uint(comp), 16);
new_comps[i] = nir_get_ssa_scalar(constant, 0);
} else {
/* conversion instruction */
- new_comps[i] = nir_ssa_scalar_chase_alu_src(comps[i], 0);
+ new_comps[i] = nir_ssa_scalar_chase_alu_src(comp, 0);
}
}
- nir_ssa_def *new_vec = nir_vec_scalars(b, new_comps, instr->num_components);
+ nir_ssa_def *new_vec = nir_vec_scalars(b, new_comps, src->ssa->num_components);
+
+ nir_instr_rewrite_src_ssa(instr, src, new_vec);
+}
+
+static bool
+fold_16bit_store_data(nir_builder *b, nir_intrinsic_instr *instr)
+{
+ nir_alu_type src_type = nir_intrinsic_src_type(instr);
+ nir_src *data_src = &instr->src[3];
+
+ b->cursor = nir_before_instr(&instr->instr);
+
+ if (!can_fold_16bit_src(data_src->ssa, src_type, true))
+ return false;
- nir_instr_rewrite_src_ssa(&instr->instr, data_src, new_vec);
+ fold_16bit_src(b, &instr->instr, data_src, src_type);
nir_intrinsic_set_src_type(instr, (src_type & ~32) | 16);
}
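+/* If every use of a 32-bit destination is a 32->16 conversion compatible
+ * with the requested rounding mode, turn those conversions into movs and
+ * shrink the destination to 16 bits. Returns true on success.
+ */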
static bool
-fold_16bit_load_data(nir_builder *b, nir_intrinsic_instr *instr)
+fold_16bit_destination(nir_ssa_def *ssa, nir_alu_type dest_type,
+ unsigned exec_mode, nir_rounding_mode rdm)
{
- nir_alu_type dest_type = nir_intrinsic_dest_type(instr);
-
- if (dest_type == nir_type_float32 &&
- nir_has_any_rounding_mode_enabled(b->shader->info.float_controls_execution_mode))
- return false;
-
bool is_f32_to_f16 = dest_type == nir_type_float32;
bool is_i32_to_i16 = dest_type == nir_type_int32 || dest_type == nir_type_uint32;
- nir_foreach_use(use, &instr->dest.ssa) {
- is_f32_to_f16 &= is_f32_to_f16_conversion(use->parent_instr);
- is_i32_to_i16 &= is_i32_to_i16_conversion(use->parent_instr);
+ nir_rounding_mode src_rdm =
+ nir_get_rounding_mode_from_float_controls(exec_mode, nir_type_float16);
+ bool allow_standard = (src_rdm == rdm || src_rdm == nir_rounding_mode_undef);
+ bool allow_rtz = rdm == nir_rounding_mode_rtz;
+ bool allow_rtne = rdm == nir_rounding_mode_rtne;
+
+ nir_foreach_use(use, ssa) {
+ nir_instr *instr = use->parent_instr;
+ is_f32_to_f16 &= (allow_standard && is_f32_to_f16_conversion(instr)) ||
+ (allow_rtz && is_n_to_m_conversion(instr, 32, nir_op_f2f16_rtz)) ||
+ (allow_rtne && is_n_to_m_conversion(instr, 32, nir_op_f2f16_rtne));
+ is_i32_to_i16 &= is_i32_to_i16_conversion(instr);
}
if (!is_f32_to_f16 && !is_i32_to_i16)
return false;
/* All uses are the same conversions. Replace them with mov. */
- nir_foreach_use(use, &instr->dest.ssa) {
+ nir_foreach_use(use, ssa) {
nir_alu_instr *conv = nir_instr_as_alu(use->parent_instr);
conv->op = nir_op_mov;
}
- instr->dest.ssa.bit_size = 16;
+ ssa->bit_size = 16;
+ return true;
+}
+
+static bool
+fold_16bit_load_data(nir_builder *b, nir_intrinsic_instr *instr,
+ unsigned exec_mode, nir_rounding_mode rdm)
+{
+ nir_alu_type dest_type = nir_intrinsic_dest_type(instr);
+
+ if (!fold_16bit_destination(&instr->dest.ssa, dest_type, exec_mode, rdm))
+ return false;
+
nir_intrinsic_set_dest_type(instr, (dest_type & ~32) | 16);
return true;
}
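+/* Fold 32->16 conversions of a texture result into the instruction itself,
+ * for sampling and fetch opcodes that can return 16-bit values directly.
+ */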
static bool
-fold_16bit_image_load_store(nir_builder *b, nir_instr *instr, UNUSED void *unused)
+fold_16bit_tex_dest(nir_tex_instr *tex, unsigned exec_mode,
+ nir_rounding_mode rdm)
{
- if (instr->type != nir_instr_type_intrinsic)
+ /* Skip sparse residency */
+ if (tex->is_sparse)
return false;
- nir_intrinsic_instr *intrinsic = nir_instr_as_intrinsic(instr);
+ if (tex->op != nir_texop_tex &&
+ tex->op != nir_texop_txb &&
+ tex->op != nir_texop_txd &&
+ tex->op != nir_texop_txl &&
+ tex->op != nir_texop_txf &&
+ tex->op != nir_texop_txf_ms &&
+ tex->op != nir_texop_tg4 &&
+ tex->op != nir_texop_tex_prefetch &&
+ tex->op != nir_texop_fragment_fetch_amd)
+ return false;
+ if (!fold_16bit_destination(&tex->dest.ssa, tex->dest_type, exec_mode, rdm))
+ return false;
+
+ tex->dest_type = (tex->dest_type & ~32) | 16;
+ return true;
+}
+
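+/* Narrow eligible texture sources (selected by options->src_types and
+ * options->sampler_dims) from 32 to 16 bits when all of their components
+ * can be folded.
+ */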
+static bool
+fold_16bit_tex_srcs(nir_builder *b, nir_tex_instr *tex,
+ struct nir_fold_tex_srcs_options *options)
+{
+ if (tex->op != nir_texop_tex &&
+ tex->op != nir_texop_txb &&
+ tex->op != nir_texop_txd &&
+ tex->op != nir_texop_txl &&
+ tex->op != nir_texop_txf &&
+ tex->op != nir_texop_txf_ms &&
+ tex->op != nir_texop_tg4 &&
+ tex->op != nir_texop_tex_prefetch &&
+ tex->op != nir_texop_fragment_fetch_amd &&
+ tex->op != nir_texop_fragment_mask_fetch_amd)
+ return false;
+
+ if (!(options->sampler_dims & BITFIELD_BIT(tex->sampler_dim)))
+ return false;
+
+ bool changed = false;
+ for (unsigned i = 0; i < tex->num_srcs; i++) {
+ /* Filter out sources that should be ignored. */
+ if (!(BITFIELD_BIT(tex->src[i].src_type) & options->src_types))
+ continue;
+
+ nir_src *src = &tex->src[i].src;
+
+ nir_alu_type src_type = nir_tex_instr_src_type(tex, i) | src->ssa->bit_size;
+
+ /* Zero-extension (u16) and sign-extension (i16) behave the same
+ * here: if bit 15 of a coordinate is set, the coordinate is out of
+ * bounds either way, txf returns 0, and the higher bits don't
+ * matter.
+ */
+ if (!can_fold_16bit_src(src->ssa, src_type, false))
+ continue;
+
+ fold_16bit_src(b, &tex->instr, src, src_type);
+ changed = true;
+ }
+
+ return changed;
+}
+
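+/* nir_shader_instructions_pass callback: dispatches to the image load/store
+ * and texture folding helpers according to the options.
+ */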
+static bool
+fold_16bit_tex_image(nir_builder *b, nir_instr *instr, void *params)
+{
+ struct nir_fold_16bit_tex_image_options *options = params;
+ unsigned exec_mode = b->shader->info.float_controls_execution_mode;
bool progress = false;
- switch (intrinsic->intrinsic) {
- case nir_intrinsic_bindless_image_store:
- case nir_intrinsic_image_deref_store:
- case nir_intrinsic_image_store:
- progress |= fold_16bit_store_data(b, intrinsic);
- break;
- case nir_intrinsic_bindless_image_load:
- case nir_intrinsic_image_deref_load:
- case nir_intrinsic_image_load:
- progress |= fold_16bit_load_data(b, intrinsic);
- break;
- default:
- break;
+ if (instr->type == nir_instr_type_intrinsic) {
+ nir_intrinsic_instr *intrinsic = nir_instr_as_intrinsic(instr);
+
+ switch (intrinsic->intrinsic) {
+ case nir_intrinsic_bindless_image_store:
+ case nir_intrinsic_image_deref_store:
+ case nir_intrinsic_image_store:
+ if (options->fold_image_load_store_data)
+ progress |= fold_16bit_store_data(b, intrinsic);
+ break;
+ case nir_intrinsic_bindless_image_load:
+ case nir_intrinsic_image_deref_load:
+ case nir_intrinsic_image_load:
+ if (options->fold_image_load_store_data)
+ progress |= fold_16bit_load_data(b, intrinsic, exec_mode, options->rounding_mode);
+ break;
+ default:
+ break;
+ }
+ } else if (instr->type == nir_instr_type_tex) {
+ nir_tex_instr *tex = nir_instr_as_tex(instr);
+
+ if (options->fold_tex_dest)
+ progress |= fold_16bit_tex_dest(tex, exec_mode, options->rounding_mode);
+
+ for (unsigned i = 0; i < options->fold_srcs_options_count; i++) {
+ progress |= fold_16bit_tex_srcs(b, tex, &options->fold_srcs_options[i]);
+ }
}
return progress;
}
-bool
-nir_fold_16bit_image_load_store_conversions(nir_shader *nir)
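+/**
+ * Fold redundant 16 <-> 32-bit conversions around texture instructions and
+ * image load/store intrinsics, so drivers whose texture/image opcodes can
+ * accept and return 16-bit types use them directly. Which destinations,
+ * sources and intrinsics are folded is controlled by *options.
+ *
+ * Copy propagation and DCE should be run after this.
+ */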
+bool
+nir_fold_16bit_tex_image(nir_shader *nir,
+ struct nir_fold_16bit_tex_image_options *options)
{
return nir_shader_instructions_pass(nir,
- fold_16bit_image_load_store,
+ fold_16bit_tex_image,
nir_metadata_block_index | nir_metadata_dominance,
- NULL);
+ options);
}
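Illustrative usage (a sketch, not part of the patch): how a driver might fill
the new options and run the pass on a shader `nir`. The option values are
made up for the example; GLSL_SAMPLER_DIM_*, nir_tex_src_*, and NIR_PASS_V
come from the NIR/GLSL headers rather than this change, and the snippet
assumes fold_srcs_options is a pointer to an array of
nir_fold_tex_srcs_options, as the loop in fold_16bit_tex_image() suggests.

   struct nir_fold_tex_srcs_options fold_srcs_options = {
      /* Only fold sources of 1D/2D samplers in this example. */
      .sampler_dims = BITFIELD_BIT(GLSL_SAMPLER_DIM_1D) |
                      BITFIELD_BIT(GLSL_SAMPLER_DIM_2D),
      /* Mask of nir_tex_src_* operands to consider. */
      .src_types = BITFIELD_BIT(nir_tex_src_coord) |
                   BITFIELD_BIT(nir_tex_src_lod),
   };
   struct nir_fold_16bit_tex_image_options fold_16bit_options = {
      .rounding_mode = nir_rounding_mode_undef,
      .fold_tex_dest = true,
      .fold_image_load_store_data = true,
      .fold_srcs_options_count = 1,
      .fold_srcs_options = &fold_srcs_options,
   };
   NIR_PASS_V(nir, nir_fold_16bit_tex_image, &fold_16bit_options);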