From 756d37b1d6d09ad7ee3b8835888a49d4256e427b Mon Sep 17 00:00:00 2001 From: Francisco Jerez Date: Sun, 8 Dec 2013 04:57:35 +0100 Subject: [PATCH] i965/fs: Add support for specifying register horizontal strides. v2: Some improvements for copy propagation with non-contiguous register strides and mismatching types. v3: Add example of the situation that the copy propagation changes are intended to avoid. Clarify that 'fs_reg::apply_stride()' is expected to work with zero strides too. Reviewed-by: Matt Turner Reviewed-by: Paul Berry --- src/mesa/drivers/dri/i965/brw_fs.cpp | 24 +++++++++++++-- src/mesa/drivers/dri/i965/brw_fs.h | 5 +++ .../drivers/dri/i965/brw_fs_copy_propagation.cpp | 36 ++++++++++++++++++++-- src/mesa/drivers/dri/i965/brw_fs_generator.cpp | 8 +++-- .../drivers/dri/i965/brw_fs_live_variables.cpp | 2 +- src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp | 4 +-- 6 files changed, 68 insertions(+), 11 deletions(-) diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp index d82e3ad..b8f7cb4 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp @@ -381,6 +381,7 @@ fs_reg::init() { memset(this, 0, sizeof(*this)); this->smear = -1; + stride = 1; } /** Generic unset register constructor. */ @@ -440,6 +441,7 @@ fs_reg::equals(const fs_reg &r) const memcmp(&fixed_hw_reg, &r.fixed_hw_reg, sizeof(fixed_hw_reg)) == 0 && smear == r.smear && + stride == r.stride && imm.u == r.imm.u); } @@ -451,6 +453,22 @@ fs_reg::retype(uint32_t type) return result; } +fs_reg & +fs_reg::apply_stride(unsigned stride) +{ + assert((this->stride * stride) <= 4 && + (is_power_of_two(stride) || stride == 0) && + file != HW_REG && file != IMM); + this->stride *= stride; + return *this; +} + +bool +fs_reg::is_contiguous() const +{ + return stride == 1; +} + bool fs_reg::is_zero() const { @@ -708,7 +726,7 @@ fs_inst::is_partial_write() { return ((this->predicate && this->opcode != BRW_OPCODE_SEL) || this->force_uncompressed || - this->force_sechalf); + this->force_sechalf || !this->dst.is_contiguous()); } int @@ -2317,6 +2335,7 @@ fs_visitor::register_coalesce() inst->src[0].negate || inst->src[0].abs || inst->src[0].smear != -1 || + !inst->src[0].is_contiguous() || inst->dst.file != GRF || inst->dst.type != inst->src[0].type) { continue; @@ -2477,7 +2496,8 @@ fs_visitor::compute_to_mrf() inst->dst.file != MRF || inst->src[0].file != GRF || inst->dst.type != inst->src[0].type || inst->src[0].abs || inst->src[0].negate || - inst->src[0].smear != -1 || inst->src[0].subreg_offset) + inst->src[0].smear != -1 || !inst->src[0].is_contiguous() || + inst->src[0].subreg_offset) continue; /* Work out which hardware MRF registers are written by this diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h index 2ec2ee7..84bed9f 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.h +++ b/src/mesa/drivers/dri/i965/brw_fs.h @@ -82,7 +82,9 @@ public: bool is_one() const; bool is_null() const; bool is_valid_3src() const; + bool is_contiguous() const; fs_reg retype(uint32_t type); + fs_reg &apply_stride(unsigned stride); /** Register file: GRF, MRF, IMM. */ enum register_file file; @@ -120,6 +122,9 @@ public: */ int subreg_offset; + /** Register region horizontal stride */ + int stride; + fs_reg *reladdr; }; diff --git a/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp b/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp index 5cf019c..cdb7b80 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp @@ -279,7 +279,9 @@ fs_visitor::try_copy_propagate(fs_inst *inst, int arg, acp_entry *entry) if (entry->src.file == IMM) return false; - if (inst->regs_read(this, arg) > 1) + /* Bail if inst is reading more than entry is writing. */ + if ((inst->regs_read(this, arg) * inst->src[arg].stride * + type_sz(inst->src[arg].type)) > type_sz(entry->dst.type)) return false; if (inst->src[arg].file != entry->dst.file || @@ -298,7 +300,32 @@ fs_visitor::try_copy_propagate(fs_inst *inst, int arg, acp_entry *entry) bool has_source_modifiers = entry->src.abs || entry->src.negate; if ((has_source_modifiers || entry->src.file == UNIFORM || - entry->src.smear != -1) && !can_do_source_mods(inst)) + entry->src.smear != -1 || !entry->src.is_contiguous()) && + !can_do_source_mods(inst)) + return false; + + /* Bail if the result of composing both strides would exceed the + * hardware limit. + */ + if (entry->src.stride * inst->src[arg].stride > 4) + return false; + + /* Bail if the result of composing both strides cannot be expressed + * as another stride. This avoids, for example, trying to transform + * this: + * + * MOV (8) rX<1>UD rY<0;1,0>UD + * FOO (8) ... rX<8;8,1>UW + * + * into this: + * + * FOO (8) ... rY<0;1,0>UW + * + * Which would have different semantics. + */ + if (entry->src.stride != 1 && + (inst->src[arg].stride * + type_sz(inst->src[arg].type)) % type_sz(entry->src.type) != 0) return false; if (has_source_modifiers && entry->dst.type != inst->src[arg].type) @@ -310,6 +337,7 @@ fs_visitor::try_copy_propagate(fs_inst *inst, int arg, acp_entry *entry) if (entry->src.smear != -1) inst->src[arg].smear = entry->src.smear; inst->src[arg].subreg_offset = entry->src.subreg_offset; + inst->src[arg].stride *= entry->src.stride; if (!inst->src[arg].abs) { inst->src[arg].abs = entry->src.abs; @@ -332,7 +360,9 @@ fs_visitor::try_constant_propagate(fs_inst *inst, acp_entry *entry) if (inst->src[i].file != entry->dst.file || inst->src[i].reg != entry->dst.reg || inst->src[i].reg_offset != entry->dst.reg_offset || - inst->src[i].subreg_offset != entry->dst.subreg_offset) + inst->src[i].subreg_offset != entry->dst.subreg_offset || + inst->src[i].type != entry->dst.type || + inst->src[i].stride > 1) continue; /* Don't bother with cases that should have been taken care of by the diff --git a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp index 0d7e8c5..310f993 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp @@ -1030,11 +1030,13 @@ brw_reg_from_fs_reg(fs_reg *reg) switch (reg->file) { case GRF: case MRF: - if (reg->smear == -1) { - brw_reg = brw_vec8_reg(brw_file_from_reg(reg), reg->reg, 0); + if (reg->stride == 0 || reg->smear >= 0) { + brw_reg = brw_vec1_reg(brw_file_from_reg(reg), reg->reg, reg->smear); } else { - brw_reg = brw_vec1_reg(brw_file_from_reg(reg), reg->reg, reg->smear); + brw_reg = brw_vec8_reg(brw_file_from_reg(reg), reg->reg, 0); + brw_reg = stride(brw_reg, 8 * reg->stride, 8, reg->stride); } + brw_reg = retype(brw_reg, reg->type); if (reg->sechalf) brw_reg = sechalf(brw_reg); diff --git a/src/mesa/drivers/dri/i965/brw_fs_live_variables.cpp b/src/mesa/drivers/dri/i965/brw_fs_live_variables.cpp index a2d88aa..2aeabad 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_live_variables.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_live_variables.cpp @@ -85,7 +85,7 @@ fs_live_variables::setup_one_read(bblock_t *block, fs_inst *inst, * would get stomped by the first decode as well. */ int end_ip = ip; - if (v->dispatch_width == 16 && (reg.smear != -1 || + if (v->dispatch_width == 16 && (reg.smear != -1 || reg.stride == 0 || (v->pixel_x.reg == reg.reg || v->pixel_y.reg == reg.reg))) { end_ip++; diff --git a/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp b/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp index fb05aec..fcd66d9 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp @@ -592,7 +592,7 @@ fs_visitor::choose_spill_reg(struct ra_graph *g) * loading pull constants, so spilling them is unlikely to reduce * register pressure anyhow. */ - if (inst->src[i].smear >= 0) { + if (inst->src[i].smear >= 0 || !inst->src[i].is_contiguous()) { no_spill[inst->src[i].reg] = true; } } @@ -601,7 +601,7 @@ fs_visitor::choose_spill_reg(struct ra_graph *g) if (inst->dst.file == GRF) { spill_costs[inst->dst.reg] += inst->regs_written * loop_scale; - if (inst->dst.smear >= 0) { + if (inst->dst.smear >= 0 || !inst->dst.is_contiguous()) { no_spill[inst->dst.reg] = true; } } -- 2.7.4