QUNIFORM_ALPHA_REF,
QUNIFORM_SAMPLE_MASK,
+
+ /**
+ * Returns the offset of the scratch buffer for register spilling.
+ */
+ QUNIFORM_SPILL_OFFSET,
+ QUNIFORM_SPILL_SIZE_PER_THREAD,
};
struct v3d_varying_slot {
uint8_t vattr_sizes[V3D_MAX_VS_INPUTS];
uint32_t num_vpm_writes;
+ /* Size in bytes of registers that have been spilled. This is how much
+ * space needs to be available in the spill BO per thread per QPU.
+ */
+ uint32_t spill_size;
+ /* Shader-db stats for register spilling. */
+ uint32_t spills, fills;
+ /**
+ * Register spilling's per-thread base address, shared between each
+ * spill/fill's addressing calculations.
+ */
+ struct qreg spill_base;
+ /* Bit vector of which temps may be spilled */
+ BITSET_WORD *spillable;
+
/**
* Array of the VARYING_SLOT_* of all FS QFILE_VARY reads.
*
struct v3d_ubo_range *ubo_ranges;
uint32_t num_ubo_ranges;
uint32_t ubo_size;
+ uint32_t spill_size;
uint8_t num_inputs;
uint8_t threads;
enum v3d_qpu_input_unpack unpack);
struct qreg vir_get_temp(struct v3d_compile *c);
+void vir_emit_last_thrsw(struct v3d_compile *c);
void vir_calculate_live_intervals(struct v3d_compile *c);
bool vir_has_implicit_uniform(struct qinst *inst);
int vir_get_implicit_uniform_src(struct qinst *inst);
void v3d_vir_to_qpu(struct v3d_compile *c, struct qpu_reg *temp_registers);
uint32_t v3d_qpu_schedule_instructions(struct v3d_compile *c);
void qpu_validate(struct v3d_compile *c);
-struct qpu_reg *v3d_register_allocate(struct v3d_compile *c);
+struct qpu_reg *v3d_register_allocate(struct v3d_compile *c, bool *spilled);
bool vir_init_reg_sets(struct v3d_compiler *compiler);
void vir_PF(struct v3d_compile *c, struct qreg src, enum v3d_qpu_pf pf);
#define PHYS_INDEX (ACC_INDEX + ACC_COUNT)
#define PHYS_COUNT 64
+/**
+ * Returns true if @inst is the last LDTMU of its TMU operation, i.e. no
+ * further LDTMU appears in the block before the next TMU-register write.
+ *
+ * Used to decide when in_tmu_operation can be cleared, which in turn
+ * gates where spills/fills may be inserted.
+ */
+static bool
+is_last_ldtmu(struct qinst *inst, struct qblock *block)
+{
+        list_for_each_entry_from(struct qinst, scan_inst, inst,
+                                 &block->instructions, link) {
+                /* The scan starts at inst itself, which is the LDTMU we
+                 * are asking about — skip it so it doesn't shadow the
+                 * instructions that follow.
+                 */
+                if (scan_inst == inst)
+                        continue;
+
+                /* Another LDTMU before any new TMU setup: not the last. */
+                if (scan_inst->qpu.sig.ldtmu)
+                        return false;
+
+                /* A new TMU setup starts: inst was the last LDTMU. */
+                if (v3d_qpu_writes_tmu(&scan_inst->qpu))
+                        return true;
+        }
+
+        return true;
+}
+
+/**
+ * Computes a spill cost for every temp and picks the best spill candidate.
+ *
+ * Each use or def of a temp adds block_scale to that temp's cost, while
+ * temps touched where spilling is impossible (inside a TMU operation, or
+ * after the last thread switch when multithreaded) are removed from
+ * c->spillable entirely.  Costs for the remaining candidates are handed
+ * to the RA graph, and ra_get_best_spill_node() returns the chosen node
+ * (or -1 if no candidate remains — the caller checks for that).
+ */
+static int
+v3d_choose_spill_node(struct v3d_compile *c, struct ra_graph *g,
+ uint32_t *temp_to_node)
+{
+ float block_scale = 1.0;
+ float spill_costs[c->num_temps];
+ bool in_tmu_operation = false;
+ bool started_last_seg = false;
+
+ for (unsigned i = 0; i < c->num_temps; i++)
+ spill_costs[i] = 0.0;
+
+ /* XXX: Scale the cost up when inside of a loop. */
+ vir_for_each_block(block, c) {
+ vir_for_each_inst(inst, block) {
+ /* We can't insert a new TMU operation while currently
+ * in a TMU operation, and we can't insert new thread
+ * switches after starting output writes.
+ */
+ bool no_spilling =
+ (in_tmu_operation ||
+ (c->threads > 1 && started_last_seg));
+
+ /* Every source read either disqualifies the temp or
+ * bumps its cost.
+ */
+ for (int i = 0; i < vir_get_nsrc(inst); i++) {
+ if (inst->src[i].file != QFILE_TEMP)
+ continue;
+
+ int temp = inst->src[i].index;
+ if (no_spilling) {
+ BITSET_CLEAR(c->spillable,
+ temp);
+ } else {
+ spill_costs[temp] += block_scale;
+ }
+ }
+
+ /* Same accounting for the destination write. */
+ if (inst->dst.file == QFILE_TEMP) {
+ int temp = inst->dst.index;
+
+ if (no_spilling) {
+ BITSET_CLEAR(c->spillable,
+ temp);
+ } else {
+ spill_costs[temp] += block_scale;
+ }
+ }
+
+ /* VPM/TLB access also marks the start of the last
+ * thread segment, even without an explicit last
+ * thrsw.
+ */
+ if (inst->is_last_thrsw)
+ started_last_seg = true;
+
+ if (v3d_qpu_writes_vpm(&inst->qpu) ||
+ v3d_qpu_uses_tlb(&inst->qpu))
+ started_last_seg = true;
+
+ /* Track when we're in between a TMU setup and the
+ * final LDTMU from that TMU setup. We can't
+ * spill/fill any temps during that time, because that
+ * involves inserting a new TMU setup/LDTMU sequence.
+ */
+ if (inst->qpu.sig.ldtmu &&
+ is_last_ldtmu(inst, block))
+ in_tmu_operation = false;
+
+ if (v3d_qpu_writes_tmu(&inst->qpu))
+ in_tmu_operation = true;
+ }
+ }
+
+ /* Publish the costs of the surviving candidates to the RA graph. */
+ for (unsigned i = 0; i < c->num_temps; i++) {
+ int node = temp_to_node[i];
+
+ if (BITSET_TEST(c->spillable, i))
+ ra_set_node_spill_cost(g, node, spill_costs[i]);
+ }
+
+ return ra_get_best_spill_node(g);
+}
+
+/* Computing the per-thread spill base address takes a few instructions,
+ * so emit that computation once, at the very start of the program.
+ */
+static void
+v3d_setup_spill_base(struct v3d_compile *c)
+{
+        c->cursor = vir_before_block(vir_entry_block(c));
+
+        int temps_before = c->num_temps;
+
+        /* The driver keeps a single global spill BO rather than one per
+         * spilling program, so each QPU thread claims its own slice of
+         * it (scaled by a driver-supplied uniform) to keep the QPUs from
+         * fighting over cache lines.
+         */
+        struct qreg per_thread =
+                vir_UMUL(c, vir_TIDX(c),
+                         vir_uniform(c, QUNIFORM_SPILL_SIZE_PER_THREAD, 0));
+
+        /* Each channel in a register covers 4 bytes, so shift the
+         * element index up by 2 to get its byte offset.
+         */
+        struct qreg per_channel = vir_SHL(c, vir_EIDX(c),
+                                          vir_uniform_ui(c, 2));
+
+        c->spill_base = vir_ADD(c,
+                                vir_ADD(c, per_thread, per_channel),
+                                vir_uniform(c, QUNIFORM_SPILL_OFFSET, 0));
+
+        /* The temps emitted above implement spilling itself — never let
+         * them become spill candidates.
+         */
+        for (int i = temps_before; i < c->num_temps; i++)
+                BITSET_CLEAR(c->spillable, i);
+}
+
+/* Writes the TMU address register: the per-thread spill base plus this
+ * temp's byte offset within the scratch region.
+ */
+static void
+v3d_emit_spill_tmua(struct v3d_compile *c, uint32_t spill_offset)
+{
+        struct qreg tmua = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUA);
+        struct qreg offset = vir_uniform_ui(c, spill_offset);
+
+        vir_ADD_dest(c, tmua, c->spill_base, offset);
+}
+
+/**
+ * Spills the given temp to scratch memory: every def gets a TMU store
+ * appended after it, and every use gets a TMU load (fill) inserted
+ * before it.
+ *
+ * Reserves one register's worth of scratch per spill (16 channels of 4
+ * bytes each, matching the EIDX<<2 addressing in v3d_setup_spill_base),
+ * and lazily emits the spill-base setup on the first spill.  Each
+ * spill/fill inserts a thrsw, so this may also have to re-establish
+ * which thrsw is the program's last one.
+ */
+static void
+v3d_spill_reg(struct v3d_compile *c, int spill_temp)
+{
+ uint32_t spill_offset = c->spill_size;
+ c->spill_size += 16 * sizeof(uint32_t);
+
+ /* First spill in this program: emit the base-address setup. */
+ if (spill_offset == 0)
+ v3d_setup_spill_base(c);
+
+ /* Remember the pre-existing last thrsw (if any), so we can tell
+ * apart the thrsws we insert below.
+ */
+ struct qinst *last_thrsw = c->last_thrsw;
+ assert(!last_thrsw || last_thrsw->is_last_thrsw);
+
+ int start_num_temps = c->num_temps;
+
+ vir_for_each_inst_inorder(inst, c) {
+ /* Fill: reload the temp from scratch before each use. */
+ for (int i = 0; i < vir_get_nsrc(inst); i++) {
+ if (inst->src[i].file != QFILE_TEMP ||
+ inst->src[i].index != spill_temp) {
+ continue;
+ }
+
+ c->cursor = vir_before_inst(inst);
+
+ v3d_emit_spill_tmua(c, spill_offset);
+ vir_emit_thrsw(c);
+ inst->src[i] = vir_LDTMU(c);
+ c->fills++;
+ }
+
+ /* Spill: redirect the def to a fresh temp and store that
+ * temp to scratch right after the instruction.
+ */
+ if (inst->dst.file == QFILE_TEMP &&
+ inst->dst.index == spill_temp) {
+ c->cursor = vir_after_inst(inst);
+
+ inst->dst.index = c->num_temps++;
+ vir_MOV_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD),
+ inst->dst);
+ v3d_emit_spill_tmua(c, spill_offset);
+ vir_emit_thrsw(c);
+ c->spills++;
+ }
+
+ /* If we didn't have a last-thrsw inserted by nir_to_vir and
+ * we've been inserting thrsws, then insert a new last_thrsw
+ * right before we start the vpm/tlb sequence for the last
+ * thread segment.
+ */
+ if (!last_thrsw && c->last_thrsw &&
+ (v3d_qpu_writes_vpm(&inst->qpu) ||
+ v3d_qpu_uses_tlb(&inst->qpu))) {
+ c->cursor = vir_before_inst(inst);
+ vir_emit_thrsw(c);
+
+ last_thrsw = c->last_thrsw;
+ last_thrsw->is_last_thrsw = true;
+ }
+ }
+
+ /* Make sure c->last_thrsw is the actual last thrsw, not just one we
+ * inserted in our most recent unspill.
+ */
+ if (last_thrsw)
+ c->last_thrsw = last_thrsw;
+
+ /* Don't allow spilling of our spilling instructions. There's no way
+ * they can help get things colored.
+ */
+ for (int i = start_num_temps; i < c->num_temps; i++)
+ BITSET_CLEAR(c->spillable, i);
+}
+
bool
vir_init_reg_sets(struct v3d_compiler *compiler)
{
* The return value should be freed by the caller.
*/
struct qpu_reg *
-v3d_register_allocate(struct v3d_compile *c)
+v3d_register_allocate(struct v3d_compile *c, bool *spilled)
{
struct node_to_temp_map map[c->num_temps];
uint32_t temp_to_node[c->num_temps];
sizeof(*temp_registers));
int acc_nodes[ACC_COUNT];
- struct ra_graph *g = ra_alloc_interference_graph(c->compiler->regs,
- c->num_temps +
- ARRAY_SIZE(acc_nodes));
+ *spilled = false;
+
+ vir_calculate_live_intervals(c);
+
/* Convert 1, 2, 4 threads to 0, 1, 2 index.
*
* V3D 4.x has double the physical register space, so 64 physical regs
thread_index--;
}
+ struct ra_graph *g = ra_alloc_interference_graph(c->compiler->regs,
+ c->num_temps +
+ ARRAY_SIZE(acc_nodes));
+
/* Make some fixed nodes for the accumulators, which we will need to
* interfere with when ops have implied r3/r4 writes or for the thread
* switches. We could represent these as classes for the nodes to
bool ok = ra_allocate(g);
if (!ok) {
+ /* Try to spill, if we can't reduce threading first. */
+ if (thread_index == 0) {
+ int node = v3d_choose_spill_node(c, g, temp_to_node);
+
+ if (node != -1) {
+ v3d_spill_reg(c, map[node].temp);
+ ralloc_free(g);
+
+ /* Ask the outer loop to call back in. */
+ *spilled = true;
+ return NULL;
+ }
+ }
+
free(temp_registers);
return NULL;
}
ralloc_free(g);
+ if (V3D_DEBUG & V3D_DEBUG_SHADERDB) {
+ fprintf(stderr, "SHADER-DB: %s prog %d/%d: %d spills\n",
+ vir_get_stage_name(c),
+ c->program_id, c->variant_id,
+ c->spills);
+
+ fprintf(stderr, "SHADER-DB: %s prog %d/%d: %d fills\n",
+ vir_get_stage_name(c),
+ c->program_id, c->variant_id,
+ c->fills);
+ }
+
return temp_registers;
}