From 0ffcb19b9d9fbe902224542047c389a661fbf816 Mon Sep 17 00:00:00 2001 From: Connor Abbott Date: Fri, 19 Feb 2021 12:33:49 +0100 Subject: [PATCH] ir3: Rewrite register allocation Switch to the new SSA-based register allocator. Part-of: --- src/freedreno/ci/traces-freedreno.yml | 6 +- src/freedreno/ir3/ir3.h | 35 +- src/freedreno/ir3/ir3_a6xx.c | 96 +- src/freedreno/ir3/ir3_compiler.c | 2 - src/freedreno/ir3/ir3_compiler.h | 2 - src/freedreno/ir3/ir3_compiler_nir.c | 53 +- src/freedreno/ir3/ir3_context.c | 2 +- src/freedreno/ir3/ir3_group.c | 187 -- src/freedreno/ir3/ir3_liveness.c | 184 ++ src/freedreno/ir3/ir3_lower_parallelcopy.c | 514 ++++ src/freedreno/ir3/ir3_merge_regs.c | 574 ++++ src/freedreno/ir3/ir3_postsched.c | 17 +- src/freedreno/ir3/ir3_print.c | 10 + src/freedreno/ir3/ir3_ra.c | 2844 +++++++++++--------- src/freedreno/ir3/ir3_ra.h | 511 ++-- src/freedreno/ir3/ir3_ra_regset.c | 249 -- src/freedreno/ir3/ir3_spill.c | 362 +++ src/freedreno/ir3/meson.build | 6 +- .../freedreno/ci/piglit-freedreno-a530-fails.txt | 6 +- .../freedreno/ci/piglit-freedreno-a630-fails.txt | 5 - 20 files changed, 3551 insertions(+), 2114 deletions(-) delete mode 100644 src/freedreno/ir3/ir3_group.c create mode 100644 src/freedreno/ir3/ir3_liveness.c create mode 100644 src/freedreno/ir3/ir3_lower_parallelcopy.c create mode 100644 src/freedreno/ir3/ir3_merge_regs.c delete mode 100644 src/freedreno/ir3/ir3_ra_regset.c create mode 100644 src/freedreno/ir3/ir3_spill.c diff --git a/src/freedreno/ci/traces-freedreno.yml b/src/freedreno/ci/traces-freedreno.yml index 05e3e3d..dd1376d 100644 --- a/src/freedreno/ci/traces-freedreno.yml +++ b/src/freedreno/ci/traces-freedreno.yml @@ -435,11 +435,11 @@ traces: - path: minetest/minetest.trace expectations: - device: freedreno-a306 - checksum: 9227cc8d4e6445f2323438340f2a5d9b + checksum: daedbc987cc1b1f934364ce6b633bc54 - device: freedreno-a530 - checksum: d0d655d81fabeb4087bf7c4837301f2a + checksum: 0054f0ba67ace5d2defe17b74b5364e9 - device: freedreno-a630 - checksum: c7349124612a8760ddd825b903561ec4 + checksum: b47f8151d4310d87070deea9059d001b - path: neverball/neverball.trace expectations: # Skipped since it's long on a530. diff --git a/src/freedreno/ir3/ir3.h b/src/freedreno/ir3/ir3.h index 6cb3895..c2733a0 100644 --- a/src/freedreno/ir3/ir3.h +++ b/src/freedreno/ir3/ir3.h @@ -83,6 +83,17 @@ struct ir3_info { uint16_t instrs_per_cat[8]; }; +struct ir3_merge_set { + uint16_t preferred_reg; + uint16_t size; + uint16_t alignment; + + unsigned interval_start; + + unsigned regs_count; + struct ir3_register **regs; +}; + struct ir3_register { enum { IR3_REG_CONST = 0x001, @@ -119,6 +130,9 @@ struct ir3_register { IR3_REG_ARRAY = 0x8000, IR3_REG_DEST = 0x10000, + IR3_REG_KILL = 0x20000, + IR3_REG_FIRST_KILL = 0x40000, + IR3_REG_UNUSED = 0x80000, } flags; /* used for cat5 instructions, but also for internal/IR level @@ -142,6 +156,7 @@ struct ir3_register { * rN.x becomes: (N << 2) | x */ uint16_t num; + uint16_t name; union { /* immediate: */ int32_t iim_val; @@ -169,6 +184,10 @@ struct ir3_register { * back to a previous instruction that we depend on). */ struct ir3_register *def; + + unsigned merge_set_offset; + struct ir3_merge_set *merge_set; + unsigned interval_start, interval_end; }; /* @@ -178,12 +197,12 @@ struct ir3_register { unsigned name ## _count, name ## _sz; \ type * name; -#define array_insert(ctx, arr, val) do { \ +#define array_insert(ctx, arr, ...) 
do { \ if (arr ## _count == arr ## _sz) { \ arr ## _sz = MAX2(2 * arr ## _sz, 16); \ arr = reralloc_size(ctx, arr, arr ## _sz * sizeof(arr[0])); \ } \ - arr[arr ##_count++] = val; \ + arr[arr ##_count++] = __VA_ARGS__; \ } while (0) struct ir3_instruction { @@ -696,12 +715,12 @@ bool ir3_valid_flags(struct ir3_instruction *instr, unsigned n, unsigned flags); set_foreach ((__instr)->uses, __entry) \ if ((__use = (void *)__entry->key)) -static inline uint32_t reg_num(struct ir3_register *reg) +static inline uint32_t reg_num(const struct ir3_register *reg) { return reg->num >> 2; } -static inline uint32_t reg_comp(struct ir3_register *reg) +static inline uint32_t reg_comp(const struct ir3_register *reg) { return reg->num & 0x3; } @@ -1479,9 +1498,6 @@ bool ir3_cp_postsched(struct ir3 *ir); /* Make arrays SSA */ bool ir3_array_to_ssa(struct ir3 *ir); -/* group neighbors and insert mov's to resolve conflicts: */ -bool ir3_group(struct ir3 *ir); - /* scheduling: */ bool ir3_sched_add_deps(struct ir3 *ir); int ir3_sched(struct ir3 *ir); @@ -1489,11 +1505,8 @@ int ir3_sched(struct ir3 *ir); struct ir3_context; bool ir3_postsched(struct ir3 *ir, struct ir3_shader_variant *v); -bool ir3_a6xx_fixup_atomic_dests(struct ir3 *ir, struct ir3_shader_variant *so); - /* register assignment: */ -struct ir3_ra_reg_set * ir3_ra_alloc_reg_set(struct ir3_compiler *compiler, bool mergedregs); -int ir3_ra(struct ir3_shader_variant *v, struct ir3_instruction **precolor, unsigned nprecolor); +int ir3_ra(struct ir3_shader_variant *v); /* legalize: */ bool ir3_legalize(struct ir3 *ir, struct ir3_shader_variant *so, int *max_bary); diff --git a/src/freedreno/ir3/ir3_a6xx.c b/src/freedreno/ir3/ir3_a6xx.c index 8481e5b..1ce1a81 100644 --- a/src/freedreno/ir3/ir3_a6xx.c +++ b/src/freedreno/ir3/ir3_a6xx.c @@ -125,9 +125,9 @@ emit_intrinsic_atomic_ssbo(struct ir3_context *ctx, nir_intrinsic_instr *intr) * src1.z - is 'data' for cmpxchg * * The combining src and dest kinda doesn't work out so well with how - * scheduling and RA work. So for now we create a dummy src2.x, and - * then in a later fixup path, insert an extra MOV out of src1.x. - * See ir3_a6xx_fixup_atomic_dests(). + * scheduling and RA work. So we create a dummy src2 which is tied to the + * destination in RA (i.e. must be allocated to the same vec2/vec3 + * register) and then immediately extract the first component. * * Note that nir already multiplies the offset by four */ @@ -193,7 +193,10 @@ emit_intrinsic_atomic_ssbo(struct ir3_context *ctx, nir_intrinsic_instr *intr) /* even if nothing consume the result, we can't DCE the instruction: */ array_insert(b, b->keeps, atomic); - return atomic; + atomic->regs[0]->wrmask = src1->regs[0]->wrmask; + struct ir3_instruction *split; + ir3_split_dest(b, &split, atomic, 0, 1); + return split; } /* src[] = { deref, coord, sample_index }. const_index[] = {} */ @@ -270,9 +273,9 @@ emit_intrinsic_atomic_image(struct ir3_context *ctx, nir_intrinsic_instr *intr) * src1.z - is 'value' for cmpxchg * * The combining src and dest kinda doesn't work out so well with how - * scheduling and RA work. So for now we create a dummy src2.x, and - * then in a later fixup path, insert an extra MOV out of src1.x. - * See ir3_a6xx_fixup_atomic_dests(). + * scheduling and RA work. So we create a dummy src2 which is tied to the + * destination in RA (i.e. must be allocated to the same vec2/vec3 + * register) and then immediately extract the first component. 
*/ dummy = create_immed(b, 0); src0 = ir3_create_collect(ctx, coords, ncoords); @@ -341,7 +344,10 @@ emit_intrinsic_atomic_image(struct ir3_context *ctx, nir_intrinsic_instr *intr) /* even if nothing consume the result, we can't DCE the instruction: */ array_insert(b, b->keeps, atomic); - return atomic; + atomic->regs[0]->wrmask = src1->regs[0]->wrmask; + struct ir3_instruction *split; + ir3_split_dest(b, &split, atomic, 0, 1); + return split; } static void @@ -373,77 +379,3 @@ const struct ir3_context_funcs ir3_a6xx_funcs = { .emit_intrinsic_image_size = emit_intrinsic_image_size, }; -/* - * Special pass to run after instruction scheduling to insert an - * extra mov from src1.x to dst. This way the other compiler passes - * can ignore this quirk of the new instruction encoding. - * - * This should run after RA. - */ - -static struct ir3_instruction * -get_atomic_dest_mov(struct ir3_instruction *atomic) -{ - struct ir3_instruction *mov; - - /* if we've already created the mov-out, then re-use it: */ - if (atomic->data) - return atomic->data; - - /* We are already out of SSA here, so we can't use the nice builders: */ - mov = ir3_instr_create(atomic->block, OPC_MOV, 2); - ir3_reg_create(mov, 0, 0); /* dst */ - ir3_reg_create(mov, 0, 0); /* src */ - - mov->cat1.src_type = TYPE_U32; - mov->cat1.dst_type = TYPE_U32; - - /* extract back out the 'dummy' which serves as stand-in for dest: */ - struct ir3_instruction *src = atomic->regs[3]->instr; - debug_assert(src->opc == OPC_META_COLLECT); - - *mov->regs[0] = *atomic->regs[0]; - *mov->regs[1] = *src->regs[1]->instr->regs[0]; - - mov->flags |= IR3_INSTR_SY; - - /* it will have already been appended to the end of the block, which - * isn't where we want it, so fix-up the location: - */ - ir3_instr_move_after(mov, atomic); - - return atomic->data = mov; -} - -bool -ir3_a6xx_fixup_atomic_dests(struct ir3 *ir, struct ir3_shader_variant *so) -{ - bool progress = false; - - if (ir3_shader_nibo(so) == 0 && !so->bindless_ibo) - return false; - - foreach_block (block, &ir->block_list) { - foreach_instr (instr, &block->instr_list) { - instr->data = NULL; - } - } - - foreach_block (block, &ir->block_list) { - foreach_instr_safe (instr, &block->instr_list) { - foreach_src (reg, instr) { - struct ir3_instruction *src = reg->instr; - - if (!src) - continue; - - if (is_atomic(src->opc) && (src->flags & IR3_INSTR_G)) { - reg->instr = get_atomic_dest_mov(src); - progress = true; - } - } - } - } - - return progress; -} diff --git a/src/freedreno/ir3/ir3_compiler.c b/src/freedreno/ir3/ir3_compiler.c index 41847e1..99f895e 100644 --- a/src/freedreno/ir3/ir3_compiler.c +++ b/src/freedreno/ir3/ir3_compiler.c @@ -78,7 +78,6 @@ ir3_compiler_create(struct fd_device *dev, uint32_t gpu_id, bool robust_ubo_acce compiler->dev = dev; compiler->gpu_id = gpu_id; compiler->robust_ubo_access = robust_ubo_access; - compiler->set = ir3_ra_alloc_reg_set(compiler, false); /* All known GPU's have 32k local memory (aka shared) */ compiler->local_mem_size = 32 * 1024; @@ -88,7 +87,6 @@ ir3_compiler_create(struct fd_device *dev, uint32_t gpu_id, bool robust_ubo_acce compiler->max_waves = 16; if (compiler->gpu_id >= 600) { - compiler->mergedregs_set = ir3_ra_alloc_reg_set(compiler, true); compiler->samgq_workaround = true; /* a6xx split the pipeline state into geometry and fragment state, in * order to let the VS run ahead of the FS. 
As a result there are now diff --git a/src/freedreno/ir3/ir3_compiler.h b/src/freedreno/ir3/ir3_compiler.h index 2366bf6..ba6737b 100644 --- a/src/freedreno/ir3/ir3_compiler.h +++ b/src/freedreno/ir3/ir3_compiler.h @@ -38,8 +38,6 @@ struct ir3_shader; struct ir3_compiler { struct fd_device *dev; uint32_t gpu_id; - struct ir3_ra_reg_set *set; - struct ir3_ra_reg_set *mergedregs_set; uint32_t shader_count; struct disk_cache *disk_cache; diff --git a/src/freedreno/ir3/ir3_compiler_nir.c b/src/freedreno/ir3/ir3_compiler_nir.c index f63a3ce..d95fe12 100644 --- a/src/freedreno/ir3/ir3_compiler_nir.c +++ b/src/freedreno/ir3/ir3_compiler_nir.c @@ -3811,13 +3811,17 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler, /* Vertex shaders in a tessellation or geometry pipeline treat END as a * NOP and has an epilogue that writes the VS outputs to local storage, to * be read by the HS. Then it resets execution mask (chmask) and chains - * to the next shader (chsh). + * to the next shader (chsh). There are also a few output values which we + * must send to the next stage via registers, and in order for both stages + * to agree on the register used we must force these to be in specific + * registers. */ if ((so->type == MESA_SHADER_VERTEX && (so->key.has_gs || so->key.tessellation)) || (so->type == MESA_SHADER_TESS_EVAL && so->key.has_gs)) { struct ir3_instruction *outputs[3]; unsigned outidxs[3]; + unsigned regids[3]; unsigned outputs_count = 0; if (ctx->primitive_id) { @@ -3828,6 +3832,7 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler, ir3_create_collect(ctx, &ctx->primitive_id, 1); outputs[outputs_count] = out; outidxs[outputs_count] = n; + regids[outputs_count] = regid(0, 1); outputs_count++; } @@ -3838,6 +3843,7 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler, ir3_create_collect(ctx, &ctx->gs_header, 1); outputs[outputs_count] = out; outidxs[outputs_count] = n; + regids[outputs_count] = regid(0, 0); outputs_count++; } @@ -3848,6 +3854,7 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler, ir3_create_collect(ctx, &ctx->tcs_header, 1); outputs[outputs_count] = out; outidxs[outputs_count] = n; + regids[outputs_count] = regid(0, 0); outputs_count++; } @@ -3858,7 +3865,7 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler, __ssa_dst(chmask); for (unsigned i = 0; i < outputs_count; i++) - __ssa_src(chmask, outputs[i], 0); + __ssa_src(chmask, outputs[i], 0)->num = regids[i]; chmask->end.outidxs = ralloc_array(chmask, unsigned, outputs_count); memcpy(chmask->end.outidxs, outidxs, sizeof(unsigned) * outputs_count); @@ -3959,6 +3966,8 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler, ir3_debug_print(ir, "AFTER: nir->ir3"); ir3_validate(ir); + IR3_PASS(ir, ir3_array_to_ssa); + do { progress = false; @@ -3980,11 +3989,6 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler, IR3_PASS(ir, ir3_sched_add_deps); - /* Group left/right neighbors, inserting mov's where needed to - * solve conflicts: - */ - IR3_PASS(ir, ir3_group); - /* At this point, all the dead code should be long gone: */ assert(!IR3_PASS(ir, ir3_dce, so)); @@ -4012,20 +4016,12 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler, so->binning_pass; if (pre_assign_inputs) { - for (unsigned i = 0; i < ctx->ninputs; i++) { - struct ir3_instruction *instr = ctx->inputs[i]; + foreach_input (in, ir) { + assert(in->opc == OPC_META_INPUT); + unsigned inidx = in->input.inidx; - if (!instr) - continue; - - unsigned n = i / 4; - unsigned c = i % 4; - unsigned regid = so->nonbinning->inputs[n].regid + c; - - 
instr->regs[0]->num = regid; + in->regs[0]->num = so->nonbinning->inputs[inidx].regid; } - - ret = ir3_ra(so, ctx->inputs, ctx->ninputs); } else if (ctx->tcs_header) { /* We need to have these values in the same registers between VS and TCS * since the VS chains to TCS and doesn't get the sysvals redelivered. @@ -4033,8 +4029,6 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler, ctx->tcs_header->regs[0]->num = regid(0, 0); ctx->primitive_id->regs[0]->num = regid(0, 1); - struct ir3_instruction *precolor[] = { ctx->tcs_header, ctx->primitive_id }; - ret = ir3_ra(so, precolor, ARRAY_SIZE(precolor)); } else if (ctx->gs_header) { /* We need to have these values in the same registers between producer * (VS or DS) and GS since the producer chains to GS and doesn't get @@ -4043,29 +4037,22 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler, ctx->gs_header->regs[0]->num = regid(0, 0); ctx->primitive_id->regs[0]->num = regid(0, 1); - struct ir3_instruction *precolor[] = { ctx->gs_header, ctx->primitive_id }; - ret = ir3_ra(so, precolor, ARRAY_SIZE(precolor)); } else if (so->num_sampler_prefetch) { assert(so->type == MESA_SHADER_FRAGMENT); - struct ir3_instruction *precolor[2]; int idx = 0; foreach_input (instr, ir) { if (instr->input.sysval != SYSTEM_VALUE_BARYCENTRIC_PERSP_PIXEL) continue; - assert(idx < ARRAY_SIZE(precolor)); - - precolor[idx] = instr; + assert(idx < 2); instr->regs[0]->num = idx; - idx++; } - ret = ir3_ra(so, precolor, idx); - } else { - ret = ir3_ra(so, NULL, 0); } + ret = ir3_ra(so); + if (ret) { DBG("RA failed!"); goto out; @@ -4073,10 +4060,6 @@ ir3_compile_shader_nir(struct ir3_compiler *compiler, IR3_PASS(ir, ir3_postsched, so); - if (compiler->gpu_id >= 600) { - IR3_PASS(ir, ir3_a6xx_fixup_atomic_dests, so); - } - if (so->type == MESA_SHADER_FRAGMENT) pack_inlocs(ctx); diff --git a/src/freedreno/ir3/ir3_context.c b/src/freedreno/ir3/ir3_context.c index 59cc14e..0e58d3c 100644 --- a/src/freedreno/ir3/ir3_context.c +++ b/src/freedreno/ir3/ir3_context.c @@ -111,7 +111,7 @@ ir3_context_init(struct ir3_compiler *compiler, if ((so->type == MESA_SHADER_FRAGMENT) && (compiler->gpu_id >= 600)) NIR_PASS_V(ctx->s, ir3_nir_lower_tex_prefetch); - NIR_PASS_V(ctx->s, nir_convert_from_ssa, true); + NIR_PASS(progress, ctx->s, nir_lower_phis_to_scalar, true); /* Super crude heuristic to limit # of tex prefetch in small * shaders. This completely ignores loops.. but that's really diff --git a/src/freedreno/ir3/ir3_group.c b/src/freedreno/ir3/ir3_group.c deleted file mode 100644 index 6efaf29..0000000 --- a/src/freedreno/ir3/ir3_group.c +++ /dev/null @@ -1,187 +0,0 @@ -/* - * Copyright (C) 2014 Rob Clark - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - * Authors: - * Rob Clark - */ - -#include "ir3.h" - -/* - * Find/group instruction neighbors: - */ - -static void -insert_mov(struct ir3_instruction *collect, int idx) -{ - struct ir3_instruction *src = ssa(collect->regs[idx+1]); - struct ir3_instruction *mov = ir3_MOV(src->block, src, - (collect->regs[idx+1]->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32); - - collect->regs[idx+1]->def = mov->regs[0]; - - /* if collect and src are in the same block, move the inserted mov - * to just before the collect to avoid a use-before-def. Otherwise - * it should be safe to leave at the end of the block it is in: - */ - if (src->block == collect->block) { - ir3_instr_move_before(mov, collect); - } -} - -/* verify that cur != instr, but cur is also not in instr's neighbor-list: */ -static bool -in_neighbor_list(struct ir3_instruction *instr, struct ir3_instruction *cur, int pos) -{ - int idx = 0; - - if (!instr) - return false; - - if (instr == cur) - return true; - - for (instr = ir3_neighbor_first(instr); instr; instr = instr->cp.right) - if ((idx++ != pos) && (instr == cur)) - return true; - - return false; -} - -static void -group_collect(struct ir3_instruction *collect) -{ - struct ir3_register **regs = &collect->regs[1]; - unsigned n = collect->regs_count - 1; - - /* first pass, figure out what has conflicts and needs a mov - * inserted. Do this up front, before starting to setup - * left/right neighbor pointers. Trying to do it in a single - * pass could result in a situation where we can't even setup - * the mov's right neighbor ptr if the next instr also needs - * a mov. - */ -restart: - for (unsigned i = 0; i < n; i++) { - struct ir3_instruction *instr = ssa(regs[i]); - if (instr) { - struct ir3_instruction *left = (i > 0) ? ssa(regs[i - 1]) : NULL; - struct ir3_instruction *right = (i < (n-1)) ? ssa(regs[i + 1]) : NULL; - bool conflict; - - /* check for left/right neighbor conflicts: */ - conflict = conflicts(instr->cp.left, left) || - conflicts(instr->cp.right, right); - - /* Mixing array elements and higher register classes - * (ie. groups) doesn't really work out in RA. See: - * - * https://trello.com/c/DqeDkeVf/156-bug-with-stk-70frag - */ - if (instr->regs[0]->flags & IR3_REG_ARRAY) - conflict = true; - - /* we also can't have an instr twice in the group: */ - for (unsigned j = i + 1; (j < n) && !conflict; j++) - if (in_neighbor_list(ssa(regs[j]), instr, i)) - conflict = true; - - if (conflict) { - insert_mov(collect, i); - /* inserting the mov may have caused a conflict - * against the previous: - */ - goto restart; - } - } - } - - /* second pass, now that we've inserted mov's, fixup left/right - * neighbors. This is guaranteed to succeed, since by definition - * the newly inserted mov's cannot conflict with anything. - */ - for (unsigned i = 0; i < n; i++) { - struct ir3_instruction *instr = ssa(regs[i]); - if (instr) { - struct ir3_instruction *left = (i > 0) ? ssa(regs[i - 1]) : NULL; - struct ir3_instruction *right = (i < (n-1)) ? 
ssa(regs[i + 1]) : NULL; - - debug_assert(!conflicts(instr->cp.left, left)); - if (left) { - instr->cp.left_cnt++; - instr->cp.left = left; - } - - debug_assert(!conflicts(instr->cp.right, right)); - if (right) { - instr->cp.right_cnt++; - instr->cp.right = right; - } - } - } -} - -static bool -instr_find_neighbors(struct ir3_instruction *instr) -{ - bool progress = false; - - if (ir3_instr_check_mark(instr)) - return false; - - if (instr->opc == OPC_META_COLLECT) { - group_collect(instr); - progress = true; - } - - foreach_ssa_src (src, instr) - progress |= instr_find_neighbors(src); - - return progress; -} - -static bool -find_neighbors(struct ir3 *ir) -{ - bool progress = false; - unsigned i; - - foreach_block (block, &ir->block_list) { - for (i = 0; i < block->keeps_count; i++) { - struct ir3_instruction *instr = block->keeps[i]; - progress |= instr_find_neighbors(instr); - } - - /* We also need to account for if-condition: */ - if (block->condition) - progress |= instr_find_neighbors(block->condition); - } - - return progress; -} - -bool -ir3_group(struct ir3 *ir) -{ - ir3_clear_mark(ir); - return find_neighbors(ir); -} diff --git a/src/freedreno/ir3/ir3_liveness.c b/src/freedreno/ir3/ir3_liveness.c new file mode 100644 index 0000000..02abb0f --- /dev/null +++ b/src/freedreno/ir3/ir3_liveness.c @@ -0,0 +1,184 @@ +/* + * Copyright (C) 2021 Valve Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "ir3_ra.h" +#include "ir3_shader.h" +#include "ralloc.h" + +/* A note on how phi node uses are handled: + * + * - Phi node sources are considered to happen after the end of the + * predecessor block, so the live_out for that block contains phi sources. + * - On the other hand, phi destinations are considered to happen at the start + * of the block, so that live_in does *not* contain phi destinations. This + * is mainly because phi destinations and live-through values have to be + * treated very differently by RA at the beginning of a block. 
+ */ + +static bool +compute_block_liveness(struct ir3_liveness *live, struct ir3_block *block, + BITSET_WORD *tmp_live, unsigned bitset_words) +{ + memcpy(tmp_live, live->live_out[block->index], bitset_words * + sizeof(BITSET_WORD)); + + /* Process instructions */ + foreach_instr_rev (instr, &block->instr_list) { + ra_foreach_dst(dst, instr) { + if (BITSET_TEST(tmp_live, dst->name)) + dst->flags &= ~IR3_REG_UNUSED; + else + dst->flags |= IR3_REG_UNUSED; + BITSET_CLEAR(tmp_live, dst->name); + } + + /* Phi node uses occur after the predecessor block */ + if (instr->opc != OPC_META_PHI) { + ra_foreach_src(src, instr) { + if (BITSET_TEST(tmp_live, src->def->name)) + src->flags &= ~IR3_REG_KILL; + else + src->flags |= IR3_REG_KILL; + } + + ra_foreach_src(src, instr) { + if (BITSET_TEST(tmp_live, src->def->name)) + src->flags &= ~IR3_REG_FIRST_KILL; + else + src->flags |= IR3_REG_FIRST_KILL; + BITSET_SET(tmp_live, src->def->name); + } + } + } + + memcpy(live->live_in[block->index], tmp_live, + bitset_words * sizeof(BITSET_WORD)); + + bool progress = false; + for (unsigned i = 0; i < block->predecessors_count; i++) { + const struct ir3_block *pred = block->predecessors[i]; + for (unsigned j = 0; j < bitset_words; j++) { + if (tmp_live[j] & ~live->live_out[pred->index][j]) + progress = true; + live->live_out[pred->index][j] |= tmp_live[j]; + } + + /* Process phi sources. */ + foreach_instr (phi, &block->instr_list) { + if (phi->opc != OPC_META_PHI) + break; + if (!phi->regs[1 + i]->def) + continue; + unsigned name = phi->regs[1 + i]->def->name; + if (!BITSET_TEST(live->live_out[pred->index], name)) { + progress = true; + BITSET_SET(live->live_out[pred->index], name); + } + } + } + + return progress; +} + +struct ir3_liveness *ir3_calc_liveness(struct ir3_shader_variant *v) +{ + struct ir3_liveness *live = rzalloc(NULL, struct ir3_liveness); + + /* Reserve name 0 to mean "doesn't have a name yet" to make the debug + * output nicer. + */ + array_insert(live, live->definitions, NULL); + + /* Build definition <-> name mapping */ + unsigned block_count = 0; + foreach_block (block, &v->ir->block_list) { + block->index = block_count++; + foreach_instr (instr, &block->instr_list) { + ra_foreach_dst(dst, instr) { + dst->name = live->definitions_count; + array_insert(live, live->definitions, dst); + } + } + } + + live->block_count = block_count; + + unsigned bitset_words = BITSET_WORDS(live->definitions_count); + BITSET_WORD *tmp_live = ralloc_array(live, BITSET_WORD, bitset_words); + live->live_in = ralloc_array(live, BITSET_WORD *, block_count); + live->live_out = ralloc_array(live, BITSET_WORD *, block_count); + unsigned i = 0; + foreach_block (block, &v->ir->block_list) { + block->index = i++; + live->live_in[block->index] = rzalloc_array(live, BITSET_WORD, bitset_words); + live->live_out[block->index] = rzalloc_array(live, BITSET_WORD, bitset_words); + } + + bool progress = true; + while (progress) { + progress = false; + foreach_block_rev (block, &v->ir->block_list) { + progress |= + compute_block_liveness(live, block, tmp_live, bitset_words); + } + } + + return live; +} + +/* Return true if "def" is live after "instr". It's assumed that "def" + * dominates "instr". + */ +bool +ir3_def_live_after(struct ir3_liveness *live, struct ir3_register *def, + struct ir3_instruction *instr) +{ + /* If it's live out then it's definitely live at the instruction. 
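The fixpoint loop in ir3_calc_liveness above is the standard backward dataflow formulation, with the per-register KILL/FIRST_KILL/UNUSED flags filled in along the way. A minimal standalone sketch of that fixpoint, assuming a toy CFG where each block only records which values it defines and uses; toy_block, toy_liveness, compute_liveness and the size limits are invented for illustration, and phi handling is omitted entirely:

#include <stdbool.h>
#include <string.h>

#define MAX_BLOCKS 8
#define MAX_VALUES 64

struct toy_block {
   bool def[MAX_VALUES], use[MAX_VALUES];
   int successors[2], num_successors;
};

struct toy_liveness {
   bool live_in[MAX_BLOCKS][MAX_VALUES];
   bool live_out[MAX_BLOCKS][MAX_VALUES];
};

void
compute_liveness(const struct toy_block *blocks, int num_blocks,
                 struct toy_liveness *live)
{
   memset(live, 0, sizeof(*live));

   bool progress = true;
   while (progress) {
      progress = false;
      /* Walk blocks backwards so the sets converge quickly when the blocks
       * are laid out roughly in program order.
       */
      for (int b = num_blocks - 1; b >= 0; b--) {
         /* live_out[b] = union of the successors' live_in */
         for (int s = 0; s < blocks[b].num_successors; s++) {
            int succ = blocks[b].successors[s];
            for (int v = 0; v < MAX_VALUES; v++) {
               if (live->live_in[succ][v] && !live->live_out[b][v]) {
                  live->live_out[b][v] = true;
                  progress = true;
               }
            }
         }
         /* live_in[b] = use[b] | (live_out[b] & ~def[b]) */
         for (int v = 0; v < MAX_VALUES; v++) {
            bool in = blocks[b].use[v] ||
                      (live->live_out[b][v] && !blocks[b].def[v]);
            if (in && !live->live_in[b][v]) {
               live->live_in[b][v] = true;
               progress = true;
            }
         }
      }
   }
}

The real pass differs mainly in that it works per instruction rather than per block-level def/use set, and in that phi sources are added to the predecessor's live_out instead of this block's live_in, as explained in the comment above.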
*/ + if (BITSET_TEST(live->live_out[instr->block->index], def->name)) + return true; + + /* If it's not live in and not defined in the same block then the live + * range can't extend to the instruction. + */ + if (def->instr->block != instr->block && + !BITSET_TEST(live->live_in[instr->block->index], def->name)) + return false; + + /* Ok, now comes the tricky case, where "def" is killed somewhere in + * "instr"'s block and we have to check if it's before or after. + */ + foreach_instr_rev (test_instr, &instr->block->instr_list) { + if (test_instr == instr) + break; + + for (unsigned i = 0; i < test_instr->regs_count; i++) { + if (test_instr->regs[i]->flags & IR3_REG_DEST) + continue; + if (test_instr->regs[i]->def == def) + return true; + } + } + + return false; +} + diff --git a/src/freedreno/ir3/ir3_lower_parallelcopy.c b/src/freedreno/ir3/ir3_lower_parallelcopy.c new file mode 100644 index 0000000..68f1fef --- /dev/null +++ b/src/freedreno/ir3/ir3_lower_parallelcopy.c @@ -0,0 +1,514 @@ +/* + * Copyright (C) 2021 Valve Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "ir3_ra.h" +#include "ir3_shader.h" + +struct copy_src { + unsigned flags; + union { + uint32_t imm; + physreg_t reg; + unsigned const_num; + }; +}; + +struct copy_entry { + physreg_t dst; + unsigned flags; + bool done; + + struct copy_src src; +}; + +static unsigned +copy_entry_size(const struct copy_entry *entry) +{ + return (entry->flags & IR3_REG_HALF) ? 
1 : 2; +} + +static struct copy_src +get_copy_src(const struct ir3_register *reg, unsigned offset) +{ + if (reg->flags & IR3_REG_IMMED) { + return (struct copy_src) { + .flags = IR3_REG_IMMED, + .imm = reg->uim_val, + }; + } else if (reg->flags & IR3_REG_CONST) { + return (struct copy_src) { + .flags = IR3_REG_CONST, + .const_num = reg->num, + }; + } else { + return (struct copy_src) { + .flags = 0, + .reg = ra_reg_get_physreg(reg) + offset, + }; + } +} + +static void +do_xor(struct ir3_instruction *instr, unsigned dst_num, unsigned src1_num, unsigned src2_num, unsigned flags) +{ + struct ir3_instruction *xor = ir3_instr_create(instr->block, OPC_XOR_B, 3); + struct ir3_register *dst = ir3_reg_create(xor, dst_num, flags | IR3_REG_DEST); + dst->wrmask = 1; + struct ir3_register *src1 = ir3_reg_create(xor, src1_num, flags); + src1->wrmask = 1; + struct ir3_register *src2 = ir3_reg_create(xor, src2_num, flags); + src2->wrmask = 1; + + ir3_instr_move_before(xor, instr); +} + +static void +do_swap(struct ir3_instruction *instr, const struct copy_entry *entry) +{ + assert(!entry->src.flags); + /* TODO implement shared swaps */ + assert(!(entry->flags & IR3_REG_SHARED)); + + if (entry->flags & IR3_REG_HALF) { + /* We currently make sure to never emit parallel copies where the + * source/destination is a half-reg above the range accessable to half + * registers. However, when a full-reg source overlaps a half-reg + * destination or vice versa, it can be very, very complicated to come + * up with a series of "legal" swaps and copies to resolve the + * parallel copy. So here we provide a fallback to implement the + * "illegal" swap instead. This may also be useful for implementing + * "spilling" half-regs to the inaccessable space. + */ + if (entry->src.reg >= RA_HALF_SIZE) { + /* Choose a temporary that doesn't overlap src or dst */ + physreg_t tmp = entry->dst < 2 ? 2 : 0; + + /* Swap src and the temporary */ + do_swap(instr, &(struct copy_entry) { + .src = { .reg = entry->src.reg & ~1u }, + .dst = tmp, + .flags = entry->flags & ~IR3_REG_HALF, + }); + + /* Do the original swap with src replaced with tmp */ + do_swap(instr, &(struct copy_entry) { + .src = { .reg = tmp + (entry->src.reg & 1) }, + .dst = entry->dst, + .flags = entry->flags, + }); + + /* Swap src and the temporary back */ + do_swap(instr, &(struct copy_entry) { + .src = { .reg = entry->src.reg & ~1u }, + .dst = tmp, + .flags = entry->flags & ~IR3_REG_HALF, + }); + return; + } + + /* If dst is not addressable, we only need to swap the arguments and + * let the case above handle it. + */ + if (entry->dst >= RA_HALF_SIZE) { + do_swap(instr, &(struct copy_entry) { + .src = { .reg = entry->dst }, + .dst = entry->src.reg, + .flags = entry->flags, + }); + return; + } + } + + unsigned src_num = ra_physreg_to_num(entry->src.reg, entry->flags); + unsigned dst_num = ra_physreg_to_num(entry->dst, entry->flags); + + do_xor(instr, dst_num, dst_num, src_num, entry->flags); + do_xor(instr, src_num, src_num, dst_num, entry->flags); + do_xor(instr, dst_num, dst_num, src_num, entry->flags); +} + +static void +do_copy(struct ir3_instruction *instr, const struct copy_entry *entry) +{ + /* TODO implement shared copies */ + assert(!(entry->flags & IR3_REG_SHARED)); + + if (entry->flags & IR3_REG_HALF) { + /* See do_swap() for why this is here. */ + if (entry->dst >= RA_HALF_SIZE) { + /* TODO: is there a hw instruction we can use for this case? */ + physreg_t tmp = !entry->src.flags && entry->src.reg < 2 ? 
2 : 0; + + do_swap(instr, &(struct copy_entry) { + .src = { .reg = entry->dst & ~1u }, + .dst = tmp, + .flags = entry->flags & ~IR3_REG_HALF, + }); + + do_copy(instr, &(struct copy_entry) { + .src = entry->src, + .dst = tmp + (entry->dst & 1), + .flags = entry->flags, + }); + + do_swap(instr, &(struct copy_entry) { + .src = { .reg = entry->dst & ~1u }, + .dst = tmp, + .flags = entry->flags & ~IR3_REG_HALF, + }); + return; + } + + if (!entry->src.flags && entry->src.reg >= RA_HALF_SIZE) { + unsigned src_num = + ra_physreg_to_num(entry->src.reg & ~1u, entry->flags & ~IR3_REG_HALF); + unsigned dst_num = ra_physreg_to_num(entry->dst, entry->flags); + + if (entry->src.reg % 2 == 0) { + /* cov.u32u16 dst, src */ + struct ir3_instruction *cov = ir3_instr_create(instr->block, OPC_MOV, 2); + ir3_reg_create(cov, dst_num, entry->flags | IR3_REG_DEST)->wrmask = 1; + ir3_reg_create(cov, src_num, entry->flags & ~IR3_REG_HALF)->wrmask = 1; + cov->cat1.dst_type = TYPE_U16; + cov->cat1.src_type = TYPE_U32; + ir3_instr_move_before(cov, instr); + } else { + /* shr.b dst, src, h(16) */ + struct ir3_instruction *shr = ir3_instr_create(instr->block, OPC_SHR_B, 3); + ir3_reg_create(shr, dst_num, entry->flags | IR3_REG_DEST)->wrmask = 1; + ir3_reg_create(shr, src_num, entry->flags & ~IR3_REG_HALF)->wrmask = 1; + ir3_reg_create(shr, 0, entry->flags | IR3_REG_IMMED)->uim_val = 16; + ir3_instr_move_before(shr, instr); + } + return; + } + } + + unsigned src_num = ra_physreg_to_num(entry->src.reg, entry->flags); + unsigned dst_num = ra_physreg_to_num(entry->dst, entry->flags); + + struct ir3_instruction *mov = ir3_instr_create(instr->block, OPC_MOV, 2); + ir3_reg_create(mov, dst_num, entry->flags | IR3_REG_DEST)->wrmask = 1; + ir3_reg_create(mov, src_num, entry->flags | entry->src.flags)->wrmask = 1; + mov->cat1.dst_type = (entry->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32; + mov->cat1.src_type = (entry->flags & IR3_REG_HALF) ? TYPE_U16 : TYPE_U32; + if (entry->src.flags & IR3_REG_IMMED) + mov->regs[1]->uim_val = entry->src.imm; + else if (entry->src.flags & IR3_REG_CONST) + mov->regs[1]->num = entry->src.const_num; + ir3_instr_move_before(mov, instr); +} + +struct copy_ctx { + /* For each physreg, the number of pending copy entries that use it as a + * source. Once this drops to zero, then the physreg is unblocked and can + * be moved to. + */ + unsigned physreg_use_count[RA_MAX_FILE_SIZE]; + + /* For each physreg, the pending copy_entry that uses it as a dest. 
*/ + struct copy_entry *physreg_dst[RA_MAX_FILE_SIZE]; + + struct copy_entry entries[RA_MAX_FILE_SIZE]; + unsigned entry_count; +}; + +static bool +entry_blocked(struct copy_entry *entry, struct copy_ctx *ctx) +{ + for (unsigned i = 0; i < copy_entry_size(entry); i++) { + if (ctx->physreg_use_count[entry->dst + i] != 0) + return true; + } + + return false; +} + +static void +split_32bit_copy(struct copy_ctx *ctx, struct copy_entry *entry) +{ + assert(!entry->done); + assert(!(entry->flags & (IR3_REG_IMMED | IR3_REG_CONST))); + assert(copy_entry_size(entry) == 2); + struct copy_entry *new_entry = &ctx->entries[ctx->entry_count++]; + + new_entry->dst = entry->dst + 1; + new_entry->src.flags = entry->src.flags; + new_entry->src.reg = entry->src.reg + 1; + new_entry->done = false; + entry->flags |= IR3_REG_HALF; + new_entry->flags = entry->flags; + ctx->physreg_dst[entry->dst + 1] = new_entry; +} + +static void +_handle_copies(struct ir3_instruction *instr, struct copy_ctx *ctx) +{ + /* Set up the bookkeeping */ + memset(ctx->physreg_dst, 0, sizeof(ctx->physreg_dst)); + memset(ctx->physreg_use_count, 0, sizeof(ctx->physreg_use_count)); + + for (unsigned i = 0; i < ctx->entry_count; i++) { + struct copy_entry *entry = &ctx->entries[i]; + for (unsigned j = 0; j < copy_entry_size(entry); j++) { + if (!entry->src.flags) + ctx->physreg_use_count[entry->src.reg + j]++; + + /* Copies should not have overlapping destinations. */ + assert(!ctx->physreg_dst[entry->dst + j]); + ctx->physreg_dst[entry->dst + j] = entry; + } + } + + bool progress = true; + while (progress) { + progress = false; + + /* Step 1: resolve paths in the transfer graph. This means finding + * copies whose destination aren't blocked by something else and then + * emitting them, continuing this process until every copy is blocked + * and there are only cycles left. + * + * TODO: We should note that src is also available in dst to unblock + * cycles that src is involved in. + */ + + for (unsigned i = 0; i < ctx->entry_count; i++) { + struct copy_entry *entry = &ctx->entries[i]; + if (!entry->done && !entry_blocked(entry, ctx)) { + entry->done = true; + progress = true; + do_copy(instr, entry); + for (unsigned j = 0; j < copy_entry_size(entry); j++) { + if (!entry->src.flags) + ctx->physreg_use_count[entry->src.reg + j]--; + ctx->physreg_dst[entry->dst + j] = NULL; + } + } + } + + if (progress) + continue; + + /* Step 2: Find partially blocked copies and split them. In the + * mergedregs case, we can 32-bit copies which are only blocked on one + * 16-bit half, and splitting them helps get things moving. + * + * We can skip splitting copies if the source isn't a register, + * however, because it does not unblock anything and therefore doesn't + * contribute to making forward progress with step 1. These copies + * should still be resolved eventually in step 1 because they can't be + * part of a cycle. + */ + for (unsigned i = 0; i < ctx->entry_count; i++) { + struct copy_entry *entry = &ctx->entries[i]; + if (entry->done || entry->flags & IR3_REG_HALF) + continue; + + if (((ctx->physreg_use_count[entry->dst] == 0 || + ctx->physreg_use_count[entry->dst + 1] == 0)) && + !(entry->flags & (IR3_REG_IMMED | IR3_REG_CONST))) { + split_32bit_copy(ctx, entry); + progress = true; + } + } + } + + /* Step 3: resolve cycles through swapping. + * + * At this point, the transfer graph should consist of only cycles. 
+ * The reason is that, given any physreg n_1 that's the source of a + * remaining entry, it has a destination n_2, which (because every + * copy is blocked) is the source of some other copy whose destination + * is n_3, and so we can follow the chain until we get a cycle. If we + * reached some other node than n_1: + * + * n_1 -> n_2 -> ... -> n_i + * ^ | + * |-------------| + * + * then n_2 would be the destination of 2 copies, which is illegal + * (checked above in an assert). So n_1 must be part of a cycle: + * + * n_1 -> n_2 -> ... -> n_i + * ^ | + * |---------------------| + * + * and this must be only cycle n_1 is involved in, because any other + * path starting from n_1 would also have to end in n_1, resulting in + * a node somewhere along the way being the destination of 2 copies + * when the 2 paths merge. + * + * The way we resolve the cycle is through picking a copy (n_1, n_2) + * and swapping n_1 and n_2. This moves n_1 to n_2, so n_2 is taken + * out of the cycle: + * + * n_1 -> ... -> n_i + * ^ | + * |--------------| + * + * and we can keep repeating this until the cycle is empty. + */ + + for (unsigned i = 0; i < ctx->entry_count; i++) { + struct copy_entry *entry = &ctx->entries[i]; + if (entry->done) + continue; + + assert(!entry->src.flags); + + /* catch trivial copies */ + if (entry->dst == entry->src.reg) { + entry->done = true; + continue; + } + + do_swap(instr, entry); + + /* Split any blocking copies whose sources are only partially + * contained within our destination. + */ + if (entry->flags & IR3_REG_HALF) { + for (unsigned j = 0; j < ctx->entry_count; j++) { + struct copy_entry *blocking = &ctx->entries[j]; + + if (blocking->done) + continue; + + if (blocking->src.reg <= entry->dst && + blocking->src.reg + 1 >= entry->dst && + !(blocking->flags & IR3_REG_HALF)) { + split_32bit_copy(ctx, blocking); + } + } + } + + /* Update sources of blocking copies. + * + * Note: at this point, every blocking copy's source should be + * contained within our destination. + */ + for (unsigned j = 0; j < ctx->entry_count; j++) { + struct copy_entry *blocking = &ctx->entries[j]; + if (blocking->src.reg >= entry->dst && + blocking->src.reg < entry->dst + copy_entry_size(entry)) { + blocking->src.reg = entry->src.reg + (blocking->src.reg - entry->dst); + } + } + } +} + +static void +handle_copies(struct ir3_instruction *instr, struct copy_entry *entries, + unsigned entry_count, bool mergedregs) +{ + struct copy_ctx ctx; + + if (mergedregs) { + /* Half regs and full regs are in the same file, so handle everything + * at once. + */ + memcpy(ctx.entries, entries, sizeof(struct copy_entry) * entry_count); + ctx.entry_count = entry_count; + _handle_copies(instr, &ctx); + } else { + /* There may be both half copies and full copies, so we have to split + * them up since they don't interfere. 
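A minimal standalone sketch of the path/cycle resolution performed by _handle_copies above, under simplifying assumptions: every location is a single full register, every source is a register (no immediates or consts), and no copy ever needs splitting, so step 2 disappears. struct copy, resolve_parallel_copy and NUM_REGS are invented for the sketch, and swaps are printed abstractly rather than lowered to the xor sequence used by do_swap:

#include <stdbool.h>
#include <stdio.h>

#define NUM_REGS 8

struct copy {
   int dst, src;
   bool done;
};

void
resolve_parallel_copy(struct copy *copies, int n)
{
   int use_count[NUM_REGS] = {0};
   for (int i = 0; i < n; i++)
      use_count[copies[i].src]++;

   /* Step 1: emit copies whose destination is not a pending source. */
   bool progress = true;
   while (progress) {
      progress = false;
      for (int i = 0; i < n; i++) {
         struct copy *c = &copies[i];
         if (c->done || use_count[c->dst] != 0)
            continue;
         printf("mov r%d, r%d\n", c->dst, c->src);
         use_count[c->src]--;
         c->done = true;
         progress = true;
      }
   }

   /* Step 3: only cycles remain. The swap satisfies this copy and moves
    * dst's old value into src, so redirect any remaining copy that read
    * dst to read src instead.
    */
   for (int i = 0; i < n; i++) {
      struct copy *c = &copies[i];
      if (c->done || c->dst == c->src)
         continue;
      printf("swap r%d, r%d\n", c->dst, c->src);
      for (int j = 0; j < n; j++) {
         if (!copies[j].done && copies[j].src == c->dst)
            copies[j].src = c->src;
      }
      c->done = true;
   }
}

int
main(void)
{
   /* A path (r2 <- r1) hanging off a cycle (r0 <-> r1). */
   struct copy copies[] = {
      { .dst = 0, .src = 1 },
      { .dst = 1, .src = 0 },
      { .dst = 2, .src = 1 },
   };
   resolve_parallel_copy(copies, 3);
   return 0;
}

On this input the sketch emits "mov r2, r1" first (the path), then breaks the remaining cycle with a single "swap r0, r1".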
+ */ + ctx.entry_count = 0; + for (unsigned i = 0; i < entry_count; i++) { + if (entries[i].flags & IR3_REG_HALF) + ctx.entries[ctx.entry_count++] = entries[i]; + } + _handle_copies(instr, &ctx); + + ctx.entry_count = 0; + for (unsigned i = 0; i < entry_count; i++) { + if (!(entries[i].flags & IR3_REG_HALF)) + ctx.entries[ctx.entry_count++] = entries[i]; + } + _handle_copies(instr, &ctx); + } +} + +void +ir3_lower_copies(struct ir3_shader_variant *v) +{ + DECLARE_ARRAY(struct copy_entry, copies); + copies_count = copies_sz = 0; + copies = NULL; + + foreach_block (block, &v->ir->block_list) { + foreach_instr_safe (instr, &block->instr_list) { + if (instr->opc == OPC_META_PARALLEL_COPY) { + copies_count = 0; + for (unsigned i = 0; i < instr->regs_count / 2; i++) { + struct ir3_register *dst = instr->regs[i]; + struct ir3_register *src = instr->regs[i + instr->regs_count / 2]; + unsigned flags = src->flags & (IR3_REG_HALF | IR3_REG_SHARED); + for (unsigned j = 0; j < reg_elems(dst); j++) { + array_insert(NULL, copies, (struct copy_entry) { + .dst = ra_num_to_physreg(dst->num + j, flags), + .src = get_copy_src(src, j * reg_elem_size(dst)), + .flags = flags, + }); + } + } + handle_copies(instr, copies, copies_count, v->mergedregs); + list_del(&instr->node); + } else if (instr->opc == OPC_META_COLLECT) { + copies_count = 0; + struct ir3_register *dst = instr->regs[0]; + unsigned flags = dst->flags & (IR3_REG_HALF | IR3_REG_SHARED); + for (unsigned i = 1; i < instr->regs_count; i++) { + struct ir3_register *src = instr->regs[i]; + array_insert(NULL, copies, (struct copy_entry) { + .dst = ra_num_to_physreg(dst->num + i - 1, flags), + .src = get_copy_src(src, 0), + .flags = flags, + }); + } + handle_copies(instr, copies, copies_count, v->mergedregs); + list_del(&instr->node); + } else if (instr->opc == OPC_META_SPLIT) { + copies_count = 0; + struct ir3_register *dst = instr->regs[0]; + struct ir3_register *src = instr->regs[1]; + unsigned flags = src->flags & (IR3_REG_HALF | IR3_REG_SHARED); + array_insert(NULL, copies, (struct copy_entry) { + .dst = ra_reg_get_physreg(dst), + .src = get_copy_src(src, instr->split.off * reg_elem_size(dst)), + .flags = flags, + }); + handle_copies(instr, copies, copies_count, v->mergedregs); + list_del(&instr->node); + } else if (instr->opc == OPC_META_PHI) { + list_del(&instr->node); + } + } + } + + if (copies) + ralloc_free(copies); +} + diff --git a/src/freedreno/ir3/ir3_merge_regs.c b/src/freedreno/ir3/ir3_merge_regs.c new file mode 100644 index 0000000..abb8e86 --- /dev/null +++ b/src/freedreno/ir3/ir3_merge_regs.c @@ -0,0 +1,574 @@ +/* + * Copyright (C) 2021 Valve Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "ir3_ra.h" +#include "ir3_compiler.h" +#include "ralloc.h" + +/* This pass "merges" compatible phi-web SSA values. First, we insert a bunch + * of parallelcopy's to trivially turn the program into CSSA form. Then we + * try to "merge" SSA def's into "merge sets" which could be allocated to a + * single register in order to eliminate copies. First we merge phi nodes, + * which should always succeed because of the parallelcopy's we inserted, and + * then we try to coalesce the copies we introduced. + * + * The merged registers are used for three purposes: + * + * 1. We always use the same pvtmem slot for spilling all SSA defs in each + * merge set. This prevents us from having to insert memory-to-memory copies + * in the spiller and makes sure we don't insert unecessary copies. + * 2. When two values are live at the same time, part of the same merge + * set, and they overlap each other in the merge set, they always occupy + * overlapping physical registers in RA. This reduces register pressure and + * copies in several important scenarios: + * - When sources of a collect are used later by something else, we don't + * have to introduce copies. + * - We can handle sequences of extracts that "explode" a vector into its + * components without any additional copying. + * 3. We use the merge sets for affinities in register allocation: That is, we + * try to allocate all the definitions in the same merge set to the + * same/compatible registers. This helps us e.g. allocate sources of a collect + * to contiguous registers without too much special code in RA. + * + * In a "normal" register allocator, or when spilling, we'd just merge + * registers in the same merge set to the same register, but with SSA-based + * register allocation we may have to split the live interval. + * + * The implementation is based on "Revisiting Out-of-SSA Translation for + * Correctness, CodeQuality, and Efficiency," and is broadly similar to the + * implementation in nir_from_ssa, with the twist that we also try to coalesce + * META_SPLIT and META_COLLECT. This makes this pass more complicated but + * prevents us from needing to handle these specially in RA and the spiller, + * which are already complicated enough. This also forces us to implement that + * value-comparison optimization they explain, as without it we wouldn't be + * able to coalesce META_SPLIT even in the simplest of cases. 
+ */ + +/* In order to dynamically reconstruct the dominance forest, we need the + * instructions ordered by a preorder traversal of the dominance tree: + */ + +static unsigned +index_instrs(struct ir3_block *block, unsigned index) +{ + foreach_instr (instr, &block->instr_list) + instr->ip = index++; + + for (unsigned i = 0; i < block->dom_children_count; i++) + index = index_instrs(block->dom_children[i], index); + + return index; +} + +/* Definitions within a merge set are ordered by instr->ip as set above: */ + +static bool +def_after(struct ir3_register *a, struct ir3_register *b) +{ + return a->instr->ip > b->instr->ip; +} + +static bool +def_dominates(struct ir3_register *a, struct ir3_register *b) +{ + if (def_after(a, b)) { + return false; + } else if (a->instr->block == b->instr->block) { + return def_after(b, a); + } else { + return ir3_block_dominates(a->instr->block, b->instr->block); + } +} + +/* This represents a region inside a register. The offset is relative to the + * start of the register, and offset + size <= size(reg). + */ +struct def_value { + struct ir3_register *reg; + unsigned offset, size; +}; + +/* Chase any copies to get the source of a region inside a register. This is + * Value(a) in the paper. + */ +static struct def_value +chase_copies(struct def_value value) +{ + while (true) { + struct ir3_instruction *instr = value.reg->instr; + if (instr->opc == OPC_META_SPLIT) { + value.offset += instr->split.off * reg_elem_size(value.reg); + value.reg = instr->regs[1]->def; + } else if (instr->opc == OPC_META_COLLECT) { + if (value.offset % reg_elem_size(value.reg) != 0 || + value.size > reg_elem_size(value.reg) || + value.offset + value.size > reg_size(value.reg)) + break; + struct ir3_register *src = instr->regs[1 + value.offset / reg_elem_size(value.reg)]; + if (!src->def) + break; + value.offset = 0; + value.reg = src->def; + } else { + /* TODO: parallelcopy */ + break; + } + } + + return value; +} + +/* This represents an entry in the merge set, and consists of a register + + * offset from the merge set base. + */ +struct merge_def { + struct ir3_register *reg; + unsigned offset; +}; + +static bool +can_skip_interference(const struct merge_def *a, const struct merge_def *b) +{ + unsigned a_start = a->offset; + unsigned b_start = b->offset; + unsigned a_end = a_start + reg_size(a->reg); + unsigned b_end = b_start + reg_size(b->reg); + + /* Registers that don't overlap never interfere */ + if (a_end <= b_start || b_end <= a_start) + return true; + + /* Disallow skipping interference unless one definition contains the + * other. This restriction is important for register allocation, because + * it means that at any given point in the program, the live values in a + * given merge set will form a tree. If they didn't, then one live value + * would partially overlap another, and they would have overlapping live + * ranges because they're live at the same point. This simplifies register + * allocation and spilling. + */ + if (!((a_start <= b_start && a_end >= b_end) || + (b_start <= a_start && b_end >= a_end))) + return false; + + /* For each register, chase the intersection of a and b to find the + * ultimate source. 
+ */ + unsigned start = MAX2(a_start, b_start); + unsigned end = MIN2(a_end, b_end); + struct def_value a_value = + chase_copies((struct def_value) { + .reg = a->reg, + .offset = start - a_start, + .size = end - start, + }); + struct def_value b_value = + chase_copies((struct def_value) { + .reg = b->reg, + .offset = start - b_start, + .size = end - start, + }); + return a_value.reg == b_value.reg && a_value.offset == b_value.offset; +} + +static struct ir3_merge_set * +get_merge_set(struct ir3_register *def) +{ + if (def->merge_set) + return def->merge_set; + + struct ir3_merge_set *set = ralloc(def, struct ir3_merge_set); + set->preferred_reg = ~0; + set->interval_start = ~0; + set->size = reg_size(def); + set->alignment = (def->flags & IR3_REG_HALF) ? 1 : 2; + set->regs_count = 1; + set->regs = ralloc(set, struct ir3_register *); + set->regs[0] = def; + + return set; +} + +/* Merges b into a */ +static struct ir3_merge_set * +merge_merge_sets(struct ir3_merge_set *a, struct ir3_merge_set *b, + int b_offset) +{ + if (b_offset < 0) + return merge_merge_sets(b, a, -b_offset); + + struct ir3_register **new_regs = + rzalloc_array(a, struct ir3_register *, a->regs_count + b->regs_count); + + unsigned a_index = 0, b_index = 0, new_index = 0; + for (; a_index < a->regs_count || b_index < b->regs_count; new_index++) { + if (b_index < b->regs_count && + (a_index == a->regs_count || + def_after(a->regs[a_index], b->regs[b_index]))) { + new_regs[new_index] = b->regs[b_index++]; + new_regs[new_index]->merge_set_offset += b_offset; + } else { + new_regs[new_index] = a->regs[a_index++]; + } + new_regs[new_index]->merge_set = a; + } + + assert(new_index == a->regs_count + b->regs_count); + + /* Technically this should be the lcm, but because alignment is only 1 or + * 2 so far this should be ok. + */ + a->alignment = MAX2(a->alignment, b->alignment); + a->regs_count += b->regs_count; + ralloc_free(a->regs); + a->regs = new_regs; + a->size = MAX2(a->size, b->size + b_offset); + + return a; +} + +static bool +merge_sets_interfere(struct ir3_liveness *live, + struct ir3_merge_set *a, struct ir3_merge_set *b, + int b_offset) +{ + if (b_offset < 0) + return merge_sets_interfere(live, b, a, -b_offset); + + struct merge_def dom[a->regs_count + b->regs_count]; + unsigned a_index = 0, b_index = 0; + int dom_index = -1; + + /* Reject trying to merge the sets if the alignment doesn't work out */ + if (b_offset % a->alignment != 0) + return true; + + while (a_index < a->regs_count || b_index < b->regs_count) { + struct merge_def current; + if (a_index == a->regs_count) { + current.reg = b->regs[b_index]; + current.offset = current.reg->merge_set_offset + b_offset; + b_index++; + } else if (b_index == b->regs_count) { + current.reg = a->regs[a_index]; + current.offset = current.reg->merge_set_offset; + a_index++; + } else { + if (def_after(b->regs[b_index], a->regs[a_index])) { + current.reg = a->regs[a_index]; + current.offset = current.reg->merge_set_offset; + a_index++; + } else { + current.reg = b->regs[b_index]; + current.offset = current.reg->merge_set_offset + b_offset; + b_index++; + } + } + + while (dom_index >= 0 && + !def_dominates(dom[dom_index].reg, current.reg)) { + dom_index--; + } + + /* TODO: in the original paper, just dom[dom_index] needs to be + * checked for interference. 
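As a rough illustration of that stack-based walk, here is a toy version in which dominance and liveness are reduced to integer comparisons: each def carries a precomputed preorder interval for its defining block in the dominance tree plus the ip of its last use. toy_def, dominates and any_interference are invented for the sketch and ignore the value-chasing and sub-register extensions entirely:

#include <assert.h>
#include <stdbool.h>

struct toy_def {
   int ip;                   /* position in a dominance-tree preorder walk */
   int dom_entry, dom_exit;  /* preorder interval of the defining block */
   int last_use_ip;          /* conservative end of the live range */
};

static bool
dominates(const struct toy_def *a, const struct toy_def *b)
{
   /* a's block dominates b's block, and a comes first within a block. */
   return a->dom_entry <= b->dom_entry && b->dom_entry < a->dom_exit &&
          a->ip <= b->ip;
}

/* defs[] must be sorted by ip. Maintain the stack of defs that dominate
 * the current one; any of them that is still live at the current def's ip
 * interferes with it.
 */
bool
any_interference(const struct toy_def *defs, int n)
{
   const struct toy_def *stack[64];
   int top = -1;

   assert(n <= 64);

   for (int i = 0; i < n; i++) {
      /* Pop defs that do not dominate the current one. */
      while (top >= 0 && !dominates(stack[top], &defs[i]))
         top--;

      /* Check the whole remaining stack, as the pass above does. */
      for (int j = 0; j <= top; j++) {
         if (stack[j]->last_use_ip > defs[i].ip)
            return true;
      }

      stack[++top] = &defs[i];
   }

   return false;
}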
We implement the value-chasing extension + * as well as support for sub-registers, which complicates this + * significantly because it's no longer the case that if a dominates b + * dominates c and a and b don't interfere then we only need to check + * interference between b and c to be sure a and c don't interfere -- + * this means we may have to check for interference against values + * higher in the stack then dom[dom_index]. In the paper there's a + * description of a way to do less interference tests with the + * value-chasing extension, but we'd have to come up with something + * ourselves for handling the similar problems that come up with + * allowing values to contain subregisters. For now we just test + * everything in the stack. + */ + for (int i = 0; i <= dom_index; i++) { + if (can_skip_interference(¤t, &dom[i])) + continue; + + /* Ok, now we actually have to check interference. Since we know + * that dom[i] dominates current, this boils down to checking + * whether dom[i] is live after current. + */ + if (ir3_def_live_after(live, dom[i].reg, current.reg->instr)) + return true; + } + + dom[++dom_index] = current; + } + + return false; +} + +static void +try_merge_defs(struct ir3_liveness *live, + struct ir3_register *a, struct ir3_register *b, + unsigned b_offset) +{ + struct ir3_merge_set *a_set = get_merge_set(a); + struct ir3_merge_set *b_set = get_merge_set(b); + + if (a_set == b_set) { + /* Note: Even in this case we may not always successfully be able to + * coalesce this copy, if the offsets don't line up. But in any + * case, we can't do anything. + */ + return; + } + + int b_set_offset = a->merge_set_offset + b_offset - b->merge_set_offset; + + if (!merge_sets_interfere(live, a_set, b_set, b_set_offset)) + merge_merge_sets(a_set, b_set, b_set_offset); +} + +static void +coalesce_phi(struct ir3_liveness *live, + struct ir3_instruction *phi) +{ + for (unsigned i = 1; i < phi->regs_count; i++) { + if (phi->regs[i]->def) + try_merge_defs(live, phi->regs[0], phi->regs[i]->def, 0); + } +} + +static void +aggressive_coalesce_parallel_copy(struct ir3_liveness *live, + struct ir3_instruction *pcopy) +{ + unsigned copies = pcopy->regs_count / 2; + for (unsigned i = 0; i < copies; i++) { + if (!(pcopy->regs[copies + i]->flags & IR3_REG_SSA)) + continue; + try_merge_defs(live, pcopy->regs[i], pcopy->regs[copies + i]->def, 0); + } +} + +static void +aggressive_coalesce_split(struct ir3_liveness *live, + struct ir3_instruction *split) +{ + try_merge_defs(live, split->regs[1]->def, split->regs[0], + split->split.off * reg_elem_size(split->regs[0])); +} + +static void +aggressive_coalesce_collect(struct ir3_liveness *live, + struct ir3_instruction *collect) +{ + for (unsigned i = 1, offset = 0; i < collect->regs_count; + offset += reg_elem_size(collect->regs[i]), i++) { + if (!(collect->regs[i]->flags & IR3_REG_SSA)) + continue; + try_merge_defs(live, collect->regs[0], collect->regs[i]->def, offset); + } +} + +static void +create_parallel_copy(struct ir3_block *block) +{ + for (unsigned i = 0; i < 2; i++) { + if (!block->successors[i]) + continue; + + struct ir3_block *succ = block->successors[i]; + + unsigned pred_idx = ir3_block_get_pred_index(succ, block); + + unsigned phi_count = 0; + foreach_instr (phi, &succ->instr_list) { + if (phi->opc != OPC_META_PHI) + break; + + /* Avoid undef */ + if ((phi->regs[1 + pred_idx]->flags & IR3_REG_SSA) && + !phi->regs[1 + pred_idx]->def) + continue; + + /* We don't support critical edges. 
If we were to support them, + * we'd need to insert parallel copies after the phi node to solve + * the lost-copy problem. + */ + assert(i == 0 && !block->successors[1]); + phi_count++; + } + + if (phi_count == 0) + continue; + + struct ir3_register *src[phi_count]; + unsigned j = 0; + foreach_instr (phi, &succ->instr_list) { + if (phi->opc != OPC_META_PHI) + break; + if ((phi->regs[1 + pred_idx]->flags & IR3_REG_SSA) && + !phi->regs[1 + pred_idx]->def) + continue; + src[j++] = phi->regs[pred_idx + 1]; + } + assert(j == phi_count); + + struct ir3_instruction *pcopy = + ir3_instr_create(block, OPC_META_PARALLEL_COPY, 2 * phi_count); + + for (j = 0; j < phi_count; j++) { + struct ir3_register *reg = __ssa_dst(pcopy); + reg->flags |= src[j]->flags & (IR3_REG_HALF | IR3_REG_ARRAY); + reg->size = reg_elems(src[j]); + } + + for (j = 0; j < phi_count; j++) { + pcopy->regs[pcopy->regs_count++] = ir3_reg_clone(block->shader, src[j]); + } + + j = 0; + foreach_instr (phi, &succ->instr_list) { + if (phi->opc != OPC_META_PHI) + break; + if ((phi->regs[1 + pred_idx]->flags & IR3_REG_SSA) && + !phi->regs[1 + pred_idx]->def) + continue; + phi->regs[1 + pred_idx]->def = pcopy->regs[j]; + phi->regs[1 + pred_idx]->flags = pcopy->regs[j]->flags & ~IR3_REG_DEST; + j++; + } + assert(j == phi_count); + } +} + +void +ir3_create_parallel_copies(struct ir3 *ir) +{ + foreach_block (block, &ir->block_list) { + create_parallel_copy(block); + } +} + +static void +index_merge_sets(struct ir3 *ir) +{ + unsigned offset = 0; + foreach_block (block, &ir->block_list) { + foreach_instr (instr, &block->instr_list) { + for (unsigned i = 0; i < instr->regs_count; i++) { + struct ir3_register *dst = instr->regs[i]; + if (!(dst->flags & IR3_REG_DEST)) + continue; + + unsigned dst_offset; + struct ir3_merge_set *merge_set = dst->merge_set; + unsigned size = reg_size(dst); + if (merge_set) { + if (merge_set->interval_start == ~0) { + merge_set->interval_start = offset; + offset += merge_set->size; + } + dst_offset = merge_set->interval_start + dst->merge_set_offset; + } else { + dst_offset = offset; + offset += size; + } + + dst->interval_start = dst_offset; + dst->interval_end = dst_offset + size; + } + } + } +} + +#define RESET "\x1b[0m" +#define BLUE "\x1b[0;34m" +#define SYN_SSA(x) BLUE x RESET + +static void +dump_merge_sets(struct ir3 *ir) +{ + printf("merge sets:\n"); + struct set *merge_sets = _mesa_pointer_set_create(NULL); + foreach_block (block, &ir->block_list) { + foreach_instr (instr, &block->instr_list) { + for (unsigned i = 0; i < instr->regs_count; i++) { + struct ir3_register *dst = instr->regs[i]; + if (!(dst->flags & IR3_REG_DEST)) + continue; + + struct ir3_merge_set *merge_set = dst->merge_set; + if (!merge_set || _mesa_set_search(merge_sets, merge_set)) + continue; + + printf("merge set, size %u, align %u:\n", merge_set->size, merge_set->alignment); + for (unsigned j = 0; j < merge_set->regs_count; j++) { + struct ir3_register *reg = merge_set->regs[j]; + printf("\t"SYN_SSA("ssa_%u")":%u, offset %u\n", reg->instr->serialno, + reg->name, reg->merge_set_offset); + } + + _mesa_set_add(merge_sets, merge_set); + } + } + } + + ralloc_free(merge_sets); +} + +void +ir3_merge_regs(struct ir3_liveness *live, struct ir3 *ir) +{ + index_instrs(ir3_start_block(ir), 0); + + /* First pass: coalesce phis, which must be together. 
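+	 * If a phi and all of its sources end up in the same merge set, they can
+	 * all be given the same physical register and the phi itself needs no
+	 * copies; otherwise the parallel copies created by
+	 * ir3_create_parallel_copies() above have to shuffle the values on the
+	 * incoming edges.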
*/ + foreach_block (block, &ir->block_list) { + foreach_instr (instr, &block->instr_list) { + if (instr->opc != OPC_META_PHI) + break; + + coalesce_phi(live, instr); + } + } + + /* Second pass: aggressively coalesce parallelcopy, split, collect */ + foreach_block (block, &ir->block_list) { + foreach_instr (instr, &block->instr_list) { + switch (instr->opc) { + case OPC_META_SPLIT: + aggressive_coalesce_split(live, instr); + break; + case OPC_META_COLLECT: + aggressive_coalesce_collect(live, instr); + break; + case OPC_META_PARALLEL_COPY: + aggressive_coalesce_parallel_copy(live, instr); + break; + default: + break; + } + } + } + + index_merge_sets(ir); + + if (ir3_shader_debug & IR3_DBG_RAMSGS) + dump_merge_sets(ir); +} + diff --git a/src/freedreno/ir3/ir3_postsched.c b/src/freedreno/ir3/ir3_postsched.c index 6a79261..3b246ed 100644 --- a/src/freedreno/ir3/ir3_postsched.c +++ b/src/freedreno/ir3/ir3_postsched.c @@ -728,23 +728,14 @@ cleanup_self_movs(struct ir3 *ir) { foreach_block (block, &ir->block_list) { foreach_instr_safe (instr, &block->instr_list) { - - foreach_src (reg, instr) { - if (!reg->def) - continue; - - if (is_self_mov(reg->def->instr)) { - list_delinit(®->def->instr->node); - reg->def = reg->def->instr->regs[1]->def; - } - } - for (unsigned i = 0; i < instr->deps_count; i++) { if (instr->deps[i] && is_self_mov(instr->deps[i])) { - list_delinit(&instr->deps[i]->node); - instr->deps[i] = instr->deps[i]->regs[1]->def->instr; + instr->deps[i] = NULL; } } + + if (is_self_mov(instr)) + list_delinit(&instr->node); } } } diff --git a/src/freedreno/ir3/ir3_print.c b/src/freedreno/ir3/ir3_print.c index d1fac82..3646421 100644 --- a/src/freedreno/ir3/ir3_print.c +++ b/src/freedreno/ir3/ir3_print.c @@ -165,6 +165,8 @@ static void print_instr_name(struct ir3_instruction *instr, bool flags) static void print_ssa_def_name(struct ir3_register *reg) { printf(SYN_SSA("ssa_%u"), reg->instr->serialno); + if (reg->name != 0) + printf(":%u", reg->name); } static void print_ssa_name(struct ir3_register *reg, bool dst) @@ -177,6 +179,9 @@ static void print_ssa_name(struct ir3_register *reg, bool dst) } else { print_ssa_def_name(reg); } + + if (reg->num != INVALID_REG) + printf("("SYN_REG("r%u.%c")")", reg_num(reg), "xyzw"[reg_comp(reg)]); } static void print_reg_name(struct ir3_instruction *instr, struct ir3_register *reg) @@ -189,6 +194,11 @@ static void print_reg_name(struct ir3_instruction *instr, struct ir3_register *r else if (reg->flags & (IR3_REG_FABS | IR3_REG_SABS)) printf("(abs)"); + if (reg->flags & IR3_REG_FIRST_KILL) + printf("(kill)"); + if (reg->flags & IR3_REG_UNUSED) + printf("(unused)"); + if (reg->flags & IR3_REG_R) printf("(r)"); diff --git a/src/freedreno/ir3/ir3_ra.c b/src/freedreno/ir3/ir3_ra.c index 7151a2e..a8a523f 100644 --- a/src/freedreno/ir3/ir3_ra.c +++ b/src/freedreno/ir3/ir3_ra.c @@ -1,4 +1,5 @@ /* + * Copyright (C) 2021 Valve Corporation * Copyright (C) 2014 Rob Clark * * Permission is hereby granted, free of charge, to any person obtaining a @@ -19,1556 +20,1951 @@ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
- * - * Authors: - * Rob Clark */ +#include "ir3_ra.h" +#include "ir3_shader.h" +#include "util/rb_tree.h" #include "util/u_math.h" -#include "util/register_allocate.h" -#include "util/ralloc.h" -#include "util/bitset.h" -#include "ir3.h" -#include "ir3_shader.h" -#include "ir3_ra.h" +/* This file implements an SSA-based register allocator. Unlike other + * SSA-based allocators, it handles vector split/collect "smartly," meaning + * that multiple values may share the same register interval. From the + * perspective of the allocator itself, only the top-level intervals matter, + * and the allocator is only concerned with allocating top-level intervals, + * which may mean moving other top-level intervals around. Other intervals, + * like the destination of a split instruction or the source of a collect + * instruction, are "locked" to their parent interval. The details of this are + * mostly handled by ir3_merge_regs and ir3_reg_ctx. + * + * We currently don't do any backtracking, but we do use the merge sets as a + * form of affinity to try to avoid moves from phis/splits/collects. Each + * merge set is what a more "classic" graph-coloring or live-range based + * allocator would consider a single register, but here we use it as merely a + * hint, except when multiple overlapping values are live at the same time. + * Each merge set has a "preferred" register, and we try to honor that when + * allocating values in the merge set. + */ +/* ir3_reg_ctx implementation. */ -#ifdef DEBUG -#define RA_DEBUG (ir3_shader_debug & IR3_DBG_RAMSGS) -#else -#define RA_DEBUG 0 -#endif -#define d(fmt, ...) do { if (RA_DEBUG) { \ - printf("RA: "fmt"\n", ##__VA_ARGS__); \ -} } while (0) +static int +ir3_reg_interval_cmp(const struct rb_node *node, const void *data) +{ + physreg_t reg = *(const physreg_t *)data; + const struct ir3_reg_interval *interval = ir3_rb_node_to_interval_const(node); + if (interval->reg->interval_start > reg) + return -1; + else if (interval->reg->interval_end <= reg) + return 1; + else + return 0; +} -#define di(instr, fmt, ...) do { if (RA_DEBUG) { \ - printf("RA: "fmt": ", ##__VA_ARGS__); \ - ir3_print_instr(instr); \ -} } while (0) +static struct ir3_reg_interval * +ir3_reg_interval_search(struct rb_tree *tree, unsigned offset) +{ + struct rb_node *node = rb_tree_search(tree, &offset, ir3_reg_interval_cmp); + return node ? ir3_rb_node_to_interval(node) : NULL; +} -/* - * Register Assignment: - * - * Uses the register_allocate util, which implements graph coloring - * algo with interference classes. To handle the cases where we need - * consecutive registers (for example, texture sample instructions), - * we model these as larger (double/quad/etc) registers which conflict - * with the corresponding registers in other classes. - * - * Additionally we create additional classes for half-regs, which - * do not conflict with the full-reg classes. We do need at least - * sizes 1-4 (to deal w/ texture sample instructions output to half- - * reg). At the moment we don't create the higher order half-reg - * classes as half-reg frequently does not have enough precision - * for texture coords at higher resolutions. - * - * There are some additional cases that we need to handle specially, - * as the graph coloring algo doesn't understand "partial writes". - * For example, a sequence like: - * - * add r0.z, ... - * sam (f32)(xy)r0.x, ... - * ... - * sam (f32)(xyzw)r0.w, r0.x, ... 
; 3d texture, so r0.xyz are coord - * - * In this scenario, we treat r0.xyz as class size 3, which is written - * (from a use/def perspective) at the 'add' instruction and ignore the - * subsequent partial writes to r0.xy. So the 'add r0.z, ...' is the - * defining instruction, as it is the first to partially write r0.xyz. - * - * To address the fragmentation that this can potentially cause, a - * two pass register allocation is used. After the first pass the - * assignment of scalars is discarded, but the assignment of vecN (for - * N > 1) is used to pre-color in the second pass, which considers - * only scalars. - * - * Arrays of arbitrary size are handled via pre-coloring a consecutive - * sequence of registers. Additional scalar (single component) reg - * names are allocated starting at ctx->class_base[total_class_count] - * (see arr->base), which are pre-colored. In the use/def graph direct - * access is treated as a single element use/def, and indirect access - * is treated as use or def of all array elements. (Only the first - * def is tracked, in case of multiple indirect writes, etc.) - * - * TODO arrays that fit in one of the pre-defined class sizes should - * not need to be pre-colored, but instead could be given a normal - * vreg name. (Ignoring this for now since it is a good way to work - * out the kinks with arbitrary sized arrays.) - * - * TODO might be easier for debugging to split this into two passes, - * the first assigning vreg names in a way that we could ir3_print() - * the result. +static struct ir3_reg_interval * +ir3_reg_interval_search_sloppy(struct rb_tree *tree, unsigned offset) +{ + struct rb_node *node = rb_tree_search_sloppy(tree, &offset, ir3_reg_interval_cmp); + return node ? ir3_rb_node_to_interval(node) : NULL; +} + +/* Get the interval covering the reg, or the closest to the right if it + * doesn't exist. */ +static struct ir3_reg_interval * +ir3_reg_interval_search_right(struct rb_tree *tree, unsigned offset) +{ + struct ir3_reg_interval *interval = ir3_reg_interval_search_sloppy(tree, offset); + if (!interval) { + return NULL; + } else if (interval->reg->interval_end > offset) { + return interval; + } else { + /* There is no interval covering reg, and ra_file_search_sloppy() + * returned the closest range to the left, so the next interval to the + * right should be the closest to the right. + */ + return ir3_reg_interval_next_or_null(interval); + } +} + +static int +ir3_reg_interval_insert_cmp(const struct rb_node *_a, const struct rb_node *_b) +{ + const struct ir3_reg_interval *a = ir3_rb_node_to_interval_const(_a); + const struct ir3_reg_interval *b = ir3_rb_node_to_interval_const(_b); + return b->reg->interval_start - a->reg->interval_start; +} +static void +interval_insert(struct ir3_reg_ctx *ctx, struct rb_tree *tree, + struct ir3_reg_interval *interval) +{ + struct ir3_reg_interval *right = + ir3_reg_interval_search_right(tree, interval->reg->interval_start); + if (right && right->reg->interval_start < interval->reg->interval_end) { + /* We disallow trees where different members have different half-ness. + * This means that we can't treat bitcasts as copies like normal + * split/collect, so something like this would require an extra copy + * in mergedregs mode, and count as 4 half-units of register pressure + * instead of 2: + * + * f16vec2 foo = unpackFloat2x16(bar) + * ... = foo.x + * ... = bar + * + * However, relaxing this rule would open a huge can of worms. 
What + * happens when there's a vector of 16 things, and the fifth element + * has been bitcasted as a half-reg? Would that element alone have to + * be small enough to be used as a half-reg source? Let's keep that + * can of worms firmly shut for now. + */ + assert((interval->reg->flags & IR3_REG_HALF) == + (right->reg->flags & IR3_REG_HALF)); -static struct ir3_instruction * name_to_instr(struct ir3_ra_ctx *ctx, unsigned name); + if (right->reg->interval_end <= interval->reg->interval_end && + right->reg->interval_start >= interval->reg->interval_start) { + /* Check if we're inserting something that's already inserted */ + assert(interval != right); -static bool name_is_array(struct ir3_ra_ctx *ctx, unsigned name); -static struct ir3_array * name_to_array(struct ir3_ra_ctx *ctx, unsigned name); + /* "right" is contained in "interval" and must become a child of + * it. There may be further children too. + */ + for (struct ir3_reg_interval *next = ir3_reg_interval_next(right); + right && right->reg->interval_start < interval->reg->interval_end; + right = next, next = ir3_reg_interval_next_or_null(next)) { + /* "right" must be contained in "interval." */ + assert(right->reg->interval_end <= interval->reg->interval_end); + assert((interval->reg->flags & IR3_REG_HALF) == + (right->reg->flags & IR3_REG_HALF)); + if (!right->parent) + ctx->interval_delete(ctx, right); + right->parent = interval; + rb_tree_remove(tree, &right->node); + rb_tree_insert(&interval->children, &right->node, + ir3_reg_interval_insert_cmp); + } + } else { + /* "right" must contain "interval," since intervals must form a + * tree. + */ + assert(right->reg->interval_start <= interval->reg->interval_start); + interval->parent = right; + interval_insert(ctx, &right->children, interval); + return; + } + } -/* does it conflict? 
*/ -static inline bool -intersects(unsigned a_start, unsigned a_end, unsigned b_start, unsigned b_end) -{ - return !((a_start >= b_end) || (b_start >= a_end)); + if (!interval->parent) + ctx->interval_add(ctx, interval); + rb_tree_insert(tree, &interval->node, ir3_reg_interval_insert_cmp); + interval->inserted = true; } -static bool -instr_before(struct ir3_instruction *a, struct ir3_instruction *b) +void +ir3_reg_interval_insert(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *interval) { - if (a->flags & IR3_INSTR_UNUSED) - return false; - return (a->ip < b->ip); + interval_insert(ctx, &ctx->intervals, interval); } -static struct ir3_instruction * -get_definer(struct ir3_ra_ctx *ctx, struct ir3_instruction *instr, - int *sz, int *off) +void +ir3_reg_interval_remove(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *interval) { - struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip]; - struct ir3_instruction *d = NULL; + if (interval->parent) { + rb_tree_remove(&interval->parent->children, &interval->node); + } else { + ctx->interval_delete(ctx, interval); + rb_tree_remove(&ctx->intervals, &interval->node); + } + + rb_tree_foreach_safe(struct ir3_reg_interval, child, &interval->children, node) { + rb_tree_remove(&interval->children, &child->node); + child->parent = interval->parent; - if (ctx->scalar_pass) { - id->defn = instr; - id->off = 0; - id->sz = 1; /* considering things as N scalar regs now */ + if (interval->parent) { + rb_tree_insert(&child->parent->children, &child->node, + ir3_reg_interval_insert_cmp); + } else { + ctx->interval_readd(ctx, interval, child); + rb_tree_insert(&ctx->intervals, &child->node, + ir3_reg_interval_insert_cmp); + } } - if (id->defn) { - *sz = id->sz; - *off = id->off; - return id->defn; + interval->inserted = false; +} + +void +ir3_reg_interval_remove_all(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *interval) +{ + assert(!interval->parent); + + ctx->interval_delete(ctx, interval); + rb_tree_remove(&ctx->intervals, &interval->node); +} + +static void +interval_dump(struct ir3_reg_interval *interval, unsigned indent) +{ + for (unsigned i = 0; i < indent; i++) + printf("\t"); + printf("reg %u start %u\n", interval->reg->name, interval->reg->interval_start); + + rb_tree_foreach(struct ir3_reg_interval, child, &interval->children, node) { + interval_dump(child, indent + 1); } - if (instr->opc == OPC_META_COLLECT) { - /* What about the case where collect is subset of array, we - * need to find the distance between where actual array starts - * and collect.. that probably doesn't happen currently. - */ - int dsz, doff; + for (unsigned i = 0; i < indent; i++) + printf("\t"); + printf("reg %u end %u\n", interval->reg->name, interval->reg->interval_end); +} - /* note: don't use foreach_ssa_src as this gets called once - * while assigning regs (which clears SSA flag) - */ - foreach_src_n (src, n, instr) { - struct ir3_instruction *dd; - if (!src->def) - continue; +void +ir3_reg_interval_dump(struct ir3_reg_interval *interval) +{ + interval_dump(interval, 0); +} - dd = get_definer(ctx, src->def->instr, &dsz, &doff); +/* These are the core datastructures used by the register allocator. 
First + * ra_interval and ra_file, which are used for intra-block tracking and use + * the ir3_reg_ctx infrastructure: + */ - if ((!d) || instr_before(dd, d)) { - d = dd; - *sz = dsz; - *off = doff - n; - } - } +struct ra_interval { + struct ir3_reg_interval interval; - } else if (instr->cp.right || instr->cp.left) { - /* covers also the meta:fo case, which ends up w/ single - * scalar instructions for each component: - */ - struct ir3_instruction *f = ir3_neighbor_first(instr); + struct rb_node physreg_node; + physreg_t physreg_start, physreg_end; - /* by definition, the entire sequence forms one linked list - * of single scalar register nodes (even if some of them may - * be splits from a texture sample (for example) instr. We - * just need to walk the list finding the first element of - * the group defined (lowest ip) - */ - int cnt = 0; + /* True if this is a source of the current instruction which is entirely + * killed. This means we can allocate the dest over it, but we can't break + * it up. + */ + bool is_killed; - /* need to skip over unused in the group: */ - while (f && (f->flags & IR3_INSTR_UNUSED)) { - f = f->cp.right; - cnt++; - } + /* True if this interval cannot be moved from its position. This is only + * used for precolored inputs to ensure that other inputs don't get + * allocated on top of them. + */ + bool frozen; +}; - while (f) { - if ((!d) || instr_before(f, d)) - d = f; - if (f == instr) - *off = cnt; - f = f->cp.right; - cnt++; - } +struct ra_file { + struct ir3_reg_ctx reg_ctx; + + BITSET_DECLARE(available, RA_MAX_FILE_SIZE); + BITSET_DECLARE(available_to_evict, RA_MAX_FILE_SIZE); + + struct rb_tree physreg_intervals; + + unsigned size; + unsigned start; +}; + +/* State for inter-block tracking. When we split a live range to make space + * for a vector, we may need to insert fixup code when a block has multiple + * predecessors that have moved the same live value to different registers. + * This keeps track of state required to do that. + */ + +struct ra_block_state { + /* Map of defining ir3_register -> physreg it was allocated to at the end + * of the block. + */ + struct hash_table *renames; + + /* For loops, we need to process a block before all its predecessors have + * been processed. In particular, we need to pick registers for values + * without knowing if all the predecessors have been renamed. This keeps + * track of the registers we chose so that when we visit the back-edge we + * can move them appropriately. If all predecessors have been visited + * before this block is visited then we don't need to fill this out. This + * is a map from ir3_register -> physreg. + */ + struct hash_table *entry_regs; + + /* True if the block has been visited and "renames" is complete. + */ + bool visited; +}; + +struct ra_parallel_copy { + struct ra_interval *interval; + physreg_t src; +}; + +/* The main context: */ + +struct ra_ctx { + /* r0.x - r47.w. On a6xx with merged-regs, hr0.x-hr47.w go into the bottom + * half of this file too. + */ + struct ra_file full; + + /* hr0.x - hr63.w, only used without merged-regs. */ + struct ra_file half; + + /* Shared regs. 
*/ + struct ra_file shared; + + struct ir3_liveness *live; + + struct ir3_block *block; + + const struct ir3_compiler *compiler; + gl_shader_stage stage; - *sz = cnt; + /* Pending moves of top-level intervals that will be emitted once we're + * finished: + */ + DECLARE_ARRAY(struct ra_parallel_copy, parallel_copies); + + struct ra_interval *intervals; + struct ra_block_state *blocks; + + bool merged_regs; +}; + +#define foreach_interval(interval, file) \ + rb_tree_foreach(struct ra_interval, interval, &(file)->physreg_intervals, physreg_node) +#define foreach_interval_rev(interval, file) \ + rb_tree_foreach(struct ra_interval, interval, &(file)->physreg_intervals, physreg_node) +#define foreach_interval_safe(interval, file) \ + rb_tree_foreach_safe(struct ra_interval, interval, &(file)->physreg_intervals, physreg_node) +#define foreach_interval_rev_safe(interval, file) \ + rb_tree_foreach_rev_safe(struct ra_interval, interval, &(file)->physreg_intervals, physreg_node) + +static struct ra_interval * +rb_node_to_interval(struct rb_node *node) +{ + return rb_node_data(struct ra_interval, node, physreg_node); +} + +static const struct ra_interval * +rb_node_to_interval_const(const struct rb_node *node) +{ + return rb_node_data(struct ra_interval, node, physreg_node); +} + +static struct ra_interval * +ra_interval_next(struct ra_interval *interval) +{ + struct rb_node *next = rb_node_next(&interval->physreg_node); + return next ? rb_node_to_interval(next) : NULL; +} +static struct ra_interval * +ra_interval_next_or_null(struct ra_interval *interval) +{ + return interval ? ra_interval_next(interval) : NULL; +} + +static int +ra_interval_cmp(const struct rb_node *node, const void *data) +{ + physreg_t reg = *(const physreg_t *)data; + const struct ra_interval *interval = rb_node_to_interval_const(node); + if (interval->physreg_start > reg) + return -1; + else if (interval->physreg_end <= reg) + return 1; + else + return 0; +} + +static struct ra_interval * +ra_interval_search_sloppy(struct rb_tree *tree, physreg_t reg) +{ + struct rb_node *node = rb_tree_search_sloppy(tree, ®, ra_interval_cmp); + return node ? rb_node_to_interval(node) : NULL; +} + +/* Get the interval covering the reg, or the closest to the right if it + * doesn't exist. + */ +static struct ra_interval * +ra_interval_search_right(struct rb_tree *tree, physreg_t reg) +{ + struct ra_interval *interval = ra_interval_search_sloppy(tree, reg); + if (!interval) { + return NULL; + } else if (interval->physreg_end > reg) { + return interval; } else { - /* second case is looking directly at the instruction which - * produces multiple values (eg, texture sample), rather - * than the split nodes that point back to that instruction. - * This isn't quite right, because it may be part of a larger - * group, such as: - * - * sam (f32)(xyzw)r0.x, ... - * add r1.x, ... - * add r1.y, ... - * sam (f32)(xyzw)r2.x, r0.w <-- (r0.w, r1.x, r1.y) - * - * need to come up with a better way to handle that case. + /* There is no interval covering reg, and ra_file_search_sloppy() + * returned the closest range to the left, so the next interval to the + * right should be the closest to the right. 
*/ - if (instr->address) { - *sz = instr->regs[0]->size; - } else { - *sz = util_last_bit(instr->regs[0]->wrmask); - } - *off = 0; - d = instr; + return ra_interval_next_or_null(interval); } +} + +static struct ra_interval * +ra_file_search_right(struct ra_file *file, physreg_t reg) +{ + return ra_interval_search_right(&file->physreg_intervals, reg); +} + +static int +ra_interval_insert_cmp(const struct rb_node *_a, const struct rb_node *_b) +{ + const struct ra_interval *a = rb_node_to_interval_const(_a); + const struct ra_interval *b = rb_node_to_interval_const(_b); + return b->physreg_start - a->physreg_start; +} + +static struct ra_interval * +ir3_reg_interval_to_ra_interval(struct ir3_reg_interval *interval) +{ + return rb_node_data(struct ra_interval, interval, interval); +} - if (d->opc == OPC_META_SPLIT) { - struct ir3_instruction *dd; - int dsz, doff; +static struct ra_file * +ir3_reg_ctx_to_file(struct ir3_reg_ctx *ctx) +{ + return rb_node_data(struct ra_file, ctx, reg_ctx); +} - dd = get_definer(ctx, d->regs[1]->def->instr, &dsz, &doff); +static void +interval_add(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *_interval) +{ + struct ra_interval *interval = ir3_reg_interval_to_ra_interval(_interval); + struct ra_file *file = ir3_reg_ctx_to_file(ctx); - /* by definition, should come before: */ - ra_assert(ctx, instr_before(dd, d)); + /* We can assume in this case that physreg_start/physreg_end is already + * initialized. + */ + for (physreg_t i = interval->physreg_start; i < interval->physreg_end; i++) { + BITSET_CLEAR(file->available, i); + BITSET_CLEAR(file->available_to_evict, i); + } - *sz = MAX2(*sz, dsz); + rb_tree_insert(&file->physreg_intervals, &interval->physreg_node, + ra_interval_insert_cmp); +} - if (instr->opc == OPC_META_SPLIT) - *off = MAX2(*off, instr->split.off); +static void +interval_delete(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *_interval) +{ + struct ra_interval *interval = ir3_reg_interval_to_ra_interval(_interval); + struct ra_file *file = ir3_reg_ctx_to_file(ctx); - d = dd; + for (physreg_t i = interval->physreg_start; i < interval->physreg_end; i++) { + BITSET_SET(file->available, i); + BITSET_SET(file->available_to_evict, i); } - ra_assert(ctx, d->opc != OPC_META_SPLIT); + rb_tree_remove(&file->physreg_intervals, &interval->physreg_node); +} + +static void +interval_readd(struct ir3_reg_ctx *ctx, struct ir3_reg_interval *_parent, + struct ir3_reg_interval *_child) +{ + struct ra_interval *parent = ir3_reg_interval_to_ra_interval(_parent); + struct ra_interval *child = ir3_reg_interval_to_ra_interval(_child); - id->defn = d; - id->sz = *sz; - id->off = *off; + child->physreg_start = parent->physreg_start + + (child->interval.reg->interval_start - parent->interval.reg->interval_start); + child->physreg_end = child->physreg_start + + (child->interval.reg->interval_end - child->interval.reg->interval_start); - return d; + interval_add(ctx, _child); } + static void -ra_block_find_definers(struct ir3_ra_ctx *ctx, struct ir3_block *block) +ra_file_init(struct ra_file *file) { - foreach_instr (instr, &block->instr_list) { - struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip]; - if (instr->regs_count == 0) - continue; - /* couple special cases: */ - if (writes_addr0(instr) || writes_addr1(instr) || writes_pred(instr)) { - id->cls = -1; - } else if (instr->regs[0]->flags & IR3_REG_ARRAY) { - id->cls = total_class_count; - } else { - /* and the normal case: */ - id->defn = get_definer(ctx, instr, &id->sz, &id->off); - id->cls = 
ra_size_to_class(id->sz, is_half(id->defn), is_shared(id->defn)); - - /* this is a bit of duct-tape.. if we have a scenario like: - * - * sam (f32)(x) out.x, ... - * sam (f32)(x) out.y, ... - * - * Then the fanout/split meta instructions for the two different - * tex instructions end up grouped as left/right neighbors. The - * upshot is that in when you get_definer() on one of the meta:fo's - * you get definer as the first sam with sz=2, but when you call - * get_definer() on the either of the sam's you get itself as the - * definer with sz=1. - * - * (We actually avoid this scenario exactly, the neighbor links - * prevent one of the output mov's from being eliminated, so this - * hack should be enough. But probably we need to rethink how we - * find the "defining" instruction.) - * - * TODO how do we figure out offset properly... - */ - if (id->defn != instr) { - struct ir3_ra_instr_data *did = &ctx->instrd[id->defn->ip]; - if (did->sz < id->sz) { - did->sz = id->sz; - did->cls = id->cls; - } - } - } + for (unsigned i = 0; i < file->size; i++) { + BITSET_SET(file->available, i); + BITSET_SET(file->available_to_evict, i); } + + file->start = 0; + + rb_tree_init(&file->reg_ctx.intervals); + rb_tree_init(&file->physreg_intervals); + + file->reg_ctx.interval_add = interval_add; + file->reg_ctx.interval_delete = interval_delete; + file->reg_ctx.interval_readd = interval_readd; } -/* give each instruction a name (and ip), and count up the # of names - * of each class - */ static void -ra_block_name_instructions(struct ir3_ra_ctx *ctx, struct ir3_block *block) +ra_file_insert(struct ra_file *file, struct ra_interval *interval) { - foreach_instr (instr, &block->instr_list) { - struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip]; + assert(interval->physreg_start < interval->physreg_end); + assert(interval->physreg_end <= file->size); + if (interval->interval.reg->flags & IR3_REG_HALF) + assert(interval->physreg_end <= RA_HALF_SIZE); -#ifdef DEBUG - instr->name = ~0; -#endif + ir3_reg_interval_insert(&file->reg_ctx, &interval->interval); +} - ctx->instr_cnt++; +static void +ra_file_remove(struct ra_file *file, struct ra_interval *interval) +{ + ir3_reg_interval_remove(&file->reg_ctx, &interval->interval); +} - if (!writes_gpr(instr)) - continue; +static void +ra_file_mark_killed(struct ra_file *file, struct ra_interval *interval) +{ + assert(!interval->interval.parent); - if (id->defn != instr) - continue; + for (physreg_t i = interval->physreg_start; i < interval->physreg_end; i++) { + BITSET_SET(file->available, i); + } - /* In scalar pass, collect/split don't get their own names, - * but instead inherit them from their src(s): - * - * Possibly we don't need this because of scalar_name(), but - * it does make the ir3_print() dumps easier to read. - */ - if (ctx->scalar_pass) { - if (instr->opc == OPC_META_SPLIT) { - instr->name = instr->regs[1]->def->instr->name + instr->split.off; - continue; - } + interval->is_killed = true; +} - if (instr->opc == OPC_META_COLLECT) { - instr->name = instr->regs[1]->def->instr->name; - continue; - } - } +static physreg_t +ra_interval_get_physreg(const struct ra_interval *interval) +{ + unsigned child_start = interval->interval.reg->interval_start; - /* arrays which don't fit in one of the pre-defined class - * sizes are pre-colored: - */ - if ((id->cls >= 0) && (id->cls < total_class_count)) { - /* in the scalar pass, we generate a name for each - * scalar component, instr->name is the name of the - * first component. - */ - unsigned n = ctx->scalar_pass ? 
dest_regs(instr) : 1; - instr->name = ctx->class_alloc_count[id->cls]; - ctx->class_alloc_count[id->cls] += n; - ctx->alloc_count += n; - } + while (interval->interval.parent) { + interval = ir3_reg_interval_to_ra_interval(interval->interval.parent); } + + return interval->physreg_start + + (child_start - interval->interval.reg->interval_start); +} + +static unsigned +ra_interval_get_num(const struct ra_interval *interval) +{ + return ra_physreg_to_num(ra_interval_get_physreg(interval), + interval->interval.reg->flags); } -/** - * Set a value for max register target. - * - * Currently this just rounds up to a multiple of full-vec4 (ie. the - * granularity that we configure the hw for.. there is no point to - * using r3.x if you aren't going to make r3.yzw available). But - * in reality there seems to be multiple thresholds that affect the - * number of waves.. and we should round up the target to the next - * threshold when we round-robin registers, to give postsched more - * options. When we understand that better, this is where we'd - * implement that. - */ static void -ra_set_register_target(struct ir3_ra_ctx *ctx, unsigned max_target) +ra_interval_init(struct ra_interval *interval, struct ir3_register *reg) { - const unsigned hvec4 = 4; - const unsigned vec4 = 2 * hvec4; + ir3_reg_interval_init(&interval->interval, reg); + interval->is_killed = false; + interval->frozen = false; +} - ctx->max_target = align(max_target, vec4); +static void +ra_interval_dump(struct ra_interval *interval) +{ + printf("physreg %u ", interval->physreg_start); - d("New max_target=%u", ctx->max_target); + ir3_reg_interval_dump(&interval->interval); } -static int -pick_in_range(BITSET_WORD *regs, unsigned min, unsigned max) +static void +ra_file_dump(struct ra_file *file) { - for (unsigned i = min; i <= max; i++) { - if (BITSET_TEST(regs, i)) { - return i; - } + rb_tree_foreach(struct ra_interval, interval, &file->physreg_intervals, physreg_node) { + ra_interval_dump(interval); } - return -1; + + unsigned start, end; + printf("available:\n"); + BITSET_FOREACH_RANGE(start, end, file->available, file->size) { + printf("%u-%u ", start, end); + } + printf("\n"); + + printf("available to evict:\n"); + BITSET_FOREACH_RANGE(start, end, file->available_to_evict, file->size) { + printf("%u-%u ", start, end); + } + printf("\n"); + printf("start: %u\n", file->start); } -static int -pick_in_range_rev(BITSET_WORD *regs, int min, int max) +static void +ra_ctx_dump(struct ra_ctx *ctx) { - for (int i = max; i >= min; i--) { - if (BITSET_TEST(regs, i)) { - return i; - } + printf("full:\n"); + ra_file_dump(&ctx->full); + printf("half:\n"); + ra_file_dump(&ctx->half); + printf("shared:\n"); + ra_file_dump(&ctx->shared); +} + +static unsigned +reg_file_size(struct ra_file *file, struct ir3_register *reg) +{ + /* Half-regs can only take up the first half of the combined regfile */ + if (reg->flags & IR3_REG_HALF) + return MIN2(file->size, RA_HALF_SIZE); + else + return file->size; +} + +/* ra_pop_interval/ra_push_interval provide an API to shuffle around multiple + * top-level intervals at once. Pop multiple intervals, then push them back in + * any order. 
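+ *
+ * A rough usage sketch (interval names and destination registers here are
+ * only illustrative):
+ *
+ *    struct ra_removed_interval tmp_a = ra_pop_interval(ctx, file, a);
+ *    struct ra_removed_interval tmp_b = ra_pop_interval(ctx, file, b);
+ *    ra_push_interval(ctx, file, &tmp_b, dst_b);
+ *    ra_push_interval(ctx, file, &tmp_a, dst_a);
+ *
+ * ra_pop_interval() also records the interval's original location in
+ * ctx->parallel_copies (if it hasn't been recorded already), so that the
+ * actual moves can be emitted later by insert_parallel_copy_instr().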
+ */ + +struct ra_removed_interval { + struct ra_interval *interval; + unsigned size; +}; + +static struct ra_removed_interval +ra_pop_interval(struct ra_ctx *ctx, struct ra_file *file, + struct ra_interval *interval) +{ + assert(!interval->interval.parent); + + /* Check if we've already moved this reg before */ + unsigned pcopy_index; + for (pcopy_index = 0; pcopy_index < ctx->parallel_copies_count; pcopy_index++) { + if (ctx->parallel_copies[pcopy_index].interval == interval) + break; + } + + if (pcopy_index == ctx->parallel_copies_count) { + array_insert(ctx, ctx->parallel_copies, (struct ra_parallel_copy) { + .interval = interval, + .src = interval->physreg_start, + }); } - return -1; + + ir3_reg_interval_remove_all(&file->reg_ctx, &interval->interval); + + return (struct ra_removed_interval) { + .interval = interval, + .size = interval->physreg_end - interval->physreg_start, + }; } -/* register selector for the a6xx+ merged register file: */ -static unsigned int -ra_select_reg_merged(unsigned int n, BITSET_WORD *regs, void *data) +static void +ra_push_interval(struct ra_ctx *ctx, struct ra_file *file, + const struct ra_removed_interval *removed, physreg_t dst) { - struct ir3_ra_ctx *ctx = data; - struct ra_class *classp = ra_get_node_class(ctx->g, n); - unsigned int class = ra_class_index(classp); - bool half, shared; - int sz = ra_class_to_size(class, &half, &shared); + struct ra_interval *interval = removed->interval; - assert (sz > 0); + interval->physreg_start = dst; + interval->physreg_end = dst + removed->size; - /* dimensions within the register class: */ - unsigned max_target, start; + ir3_reg_interval_insert(&file->reg_ctx, &interval->interval); +} - /* the regs bitset will include *all* of the virtual regs, but we lay - * out the different classes consecutively in the virtual register - * space. So we just need to think about the base offset of a given - * class within the virtual register space, and offset the register - * space we search within by that base offset. - */ - unsigned base; - - /* TODO I think eventually we want to round-robin in vector pass - * as well, but needs some more work to calculate # of live vals - * for this. (Maybe with some work, we could just figure out - * the scalar target and use that, since that is what we care - * about in the end.. but that would mean setting up use-def/ - * liveranges for scalar pass before doing vector pass.) - * - * For now, in the vector class, just move assignments for scalar - * vals higher to hopefully prevent them from limiting where vecN - * values can be placed. Since the scalar values are re-assigned - * in the 2nd pass, we don't really care where they end up in the - * vector pass. - */ - if (!ctx->scalar_pass) { - base = ctx->set->gpr_to_ra_reg[class][0]; - if (shared) { - max_target = SHARED_CLASS_REGS(class - SHARED_OFFSET); - } else if (half) { - max_target = HALF_CLASS_REGS(class - HALF_OFFSET); - } else { - max_target = CLASS_REGS(class); +/* Pick up the interval and place it at "dst". */ +static void +ra_move_interval(struct ra_ctx *ctx, struct ra_file *file, + struct ra_interval *interval, physreg_t dst) +{ + struct ra_removed_interval temp = ra_pop_interval(ctx, file, interval); + ra_push_interval(ctx, file, &temp, dst); +} + +static bool +get_reg_specified(struct ra_file *file, struct ir3_register *reg, physreg_t physreg, bool is_source) +{ + for (unsigned i = 0; i < reg_size(reg); i++) { + if (!BITSET_TEST(is_source ? 
file->available_to_evict : file->available, physreg + i)) + return false; + } + + return true; +} + +/* Try to evict any registers conflicting with the proposed spot "physreg" for + * "reg". That is, move them to other places so that we can allocate "physreg" + * here. + */ + +static bool +try_evict_regs(struct ra_ctx *ctx, struct ra_file *file, + struct ir3_register *reg, physreg_t physreg, + unsigned *_eviction_count, bool is_source, bool speculative) +{ + BITSET_DECLARE(available_to_evict, RA_MAX_FILE_SIZE); + memcpy(available_to_evict, file->available_to_evict, sizeof(available_to_evict)); + + for (unsigned i = 0; i < reg_size(reg); i++) + BITSET_CLEAR(available_to_evict, physreg + i); + + unsigned eviction_count = 0; + /* Iterate over each range conflicting with physreg */ + for (struct ra_interval *conflicting = ra_file_search_right(file, physreg), + *next = ra_interval_next_or_null(conflicting); + conflicting != NULL && conflicting->physreg_start < physreg + reg_size(reg); + conflicting = next, next = ra_interval_next_or_null(next)) { + if (!is_source && conflicting->is_killed) + continue; + + if (conflicting->frozen) { + assert(speculative); + return false; } - if ((sz == 1) && !shared) { - return pick_in_range_rev(regs, base, base + max_target); - } else { - return pick_in_range(regs, base, base + max_target); + unsigned avail_start, avail_end; + bool evicted = false; + BITSET_FOREACH_RANGE(avail_start, avail_end, available_to_evict, + reg_file_size(file, conflicting->interval.reg)) { + unsigned size = avail_end - avail_start; + + /* non-half registers must be aligned */ + if (!(conflicting->interval.reg->flags & IR3_REG_HALF) && avail_start % 2 == 1) { + avail_start++; + size--; + } + + if (size >= conflicting->physreg_end - conflicting->physreg_start) { + for (unsigned i = 0; i < conflicting->physreg_end - conflicting->physreg_start; i++) + BITSET_CLEAR(available_to_evict, avail_start + i); + eviction_count += conflicting->physreg_end - conflicting->physreg_start; + if (!speculative) + ra_move_interval(ctx, file, conflicting, avail_start); + evicted = true; + break; + } } - } else { - ra_assert(ctx, sz == 1); + + if (!evicted) + return false; } - /* NOTE: this is only used in scalar pass, so the register - * class will be one of the scalar classes (ie. idx==0): + *_eviction_count = eviction_count; + return true; +} + +static int removed_interval_cmp(const void *_i1, const void *_i2) +{ + const struct ra_removed_interval *i1 = _i1; + const struct ra_removed_interval *i2 = _i2; + + /* We sort the registers as follows: + * + * |--------------------------------------------------------------------| + * | | | | | + * | Half live-through | Half killed | Full killed | Full live-through | + * | | | | | + * |--------------------------------------------------------------------| + * | | + * | Destination | + * | | + * |-----------------| + * + * Half-registers have to be first so that they stay in the low half of + * the register file. Then half and full killed must stay together so that + * there's a contiguous range where we can put the register. With this + * structure we should be able to accomodate any collection of intervals + * such that the total number of half components is within the half limit + * and the combined components are within the full limit. 
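+	 *
+	 * As a purely illustrative example, popping a live-through hr0.x, a
+	 * killed hr1.x, a killed r2.xy and a live-through r3.x would sort them
+	 * back in the order hr0.x, hr1.x, r2.xy, r3.x, keeping the killed hr1.x
+	 * and r2.xy contiguous so that the destination can be laid over them.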
*/ - base = ctx->set->gpr_to_ra_reg[class][0]; - if (shared) { - max_target = SHARED_CLASS_REGS(0); - start = 0; - } else if (half) { - max_target = ctx->max_target; - start = ctx->start_search_reg; + + unsigned i1_align = reg_elem_size(i1->interval->interval.reg); + unsigned i2_align = reg_elem_size(i2->interval->interval.reg); + if (i1_align > i2_align) + return 1; + if (i1_align < i2_align) + return -1; + + if (i1_align == 1) { + if (i2->interval->is_killed) + return -1; + if (i1->interval->is_killed) + return 1; } else { - max_target = ctx->max_target / 2; - start = ctx->start_search_reg; + if (i2->interval->is_killed) + return 1; + if (i1->interval->is_killed) + return -1; } - /* For cat4 instructions, if the src reg is already assigned, and - * avail to pick, use it. Because this doesn't introduce unnecessary - * dependencies, and it potentially avoids needing (ss) syncs to - * for write after read hazards: - */ - struct ir3_instruction *instr = name_to_instr(ctx, n); - if (is_sfu(instr)) { - struct ir3_register *src = instr->regs[1]; - int src_n; - - if ((src->flags & IR3_REG_ARRAY) && !(src->flags & IR3_REG_RELATIV)) { - struct ir3_array *arr = ir3_lookup_array(ctx->ir, src->array.id); - src_n = arr->base + src->array.offset; + return 0; +} + +/* "Compress" all the live intervals so that there is enough space for the + * destination register. As there can be gaps when a more-aligned interval + * follows a less-aligned interval, this also sorts them to remove such + * "padding", which may be required when space is very tight. This isn't + * amazing, but should be used only as a last resort in case the register file + * is almost full and badly fragmented. + * + * Return the physreg to use. + */ +static physreg_t +compress_regs_left(struct ra_ctx *ctx, struct ra_file *file, unsigned size, + unsigned align, bool is_source) +{ + DECLARE_ARRAY(struct ra_removed_interval, intervals); + intervals_count = intervals_sz = 0; + intervals = NULL; + + unsigned removed_full_size = 0; + unsigned removed_half_size = 0; + unsigned file_size = align == 1 ? MIN2(file->size, RA_HALF_SIZE) : file->size; + physreg_t start_reg = 0; + + foreach_interval_rev_safe(interval, file) { + /* Check if we can sort the intervals *after* this one and have + * enough space leftover to accomodate "size" units. + */ + if (align == 1) { + if (interval->physreg_end + removed_half_size <= file_size - size) { + start_reg = interval->physreg_end; + break; + } } else { - src_n = scalar_name(ctx, src->def->instr, 0); + if (interval->physreg_end + removed_half_size <= file_size - + removed_full_size - size) { + start_reg = interval->physreg_end; + break; + } } - unsigned reg = ra_get_node_reg(ctx->g, src_n); + /* We assume that all frozen intervals are at the start and that we + * can avoid popping them. + */ + assert(!interval->frozen); - /* Check if the src register has been assigned yet: */ - if (reg != NO_REG) { - if (BITSET_TEST(regs, reg)) { - return reg; - } + /* Killed sources don't count because they go at the end and can + * overlap the register we're trying to add. 
+ */ + if (!interval->is_killed && !is_source) { + if (interval->interval.reg->flags & IR3_REG_HALF) + removed_half_size += interval->physreg_end - interval->physreg_start; + else + removed_full_size += interval->physreg_end - interval->physreg_start; } - } - int r = pick_in_range(regs, base + start, base + max_target); - if (r < 0) { - /* wrap-around: */ - r = pick_in_range(regs, base, base + start); + /* Now that we've done the accounting, pop this off */ + d("popping interval %u physreg %u\n", interval->interval.reg->name, interval->physreg_start); + array_insert(ctx, intervals, ra_pop_interval(ctx, file, interval)); } - if (r < 0) { - /* overflow, we need to increase max_target: */ - ra_set_register_target(ctx, ctx->max_target + 1); - return ra_select_reg_merged(n, regs, data); + /* TODO: In addition to skipping registers at the beginning that are + * well-packed, we should try to skip registers at the end. + */ + + qsort(intervals, intervals_count, sizeof(*intervals), removed_interval_cmp); + + physreg_t physreg = start_reg; + physreg_t ret_reg = (physreg_t) ~0; + for (unsigned i = 0; i < intervals_count; i++) { + if (ret_reg == (physreg_t) ~0 && + ((intervals[i].interval->is_killed && !is_source) || + !(intervals[i].interval->interval.reg->flags & IR3_REG_HALF))) { + ret_reg = ALIGN(physreg, align); + } + + if (ret_reg != (physreg_t) ~0 && + (is_source || !intervals[i].interval->is_killed)) { + physreg = MAX2(physreg, ret_reg + size); + } + + if (!(intervals[i].interval->interval.reg->flags & IR3_REG_HALF)) { + physreg = ALIGN(physreg, 2); + } + + if (physreg + intervals[i].size > + reg_file_size(file, intervals[i].interval->interval.reg)) { + d("ran out of room for interval %u!\n", intervals[i].interval->interval.reg->name); + unreachable("reg pressure calculation was wrong!"); + return 0; + } + + d("pushing interval %u physreg %u\n", intervals[i].interval->interval.reg->name, physreg); + ra_push_interval(ctx, file, &intervals[i], physreg); + + physreg += intervals[i].size; } - if (classp == ctx->set->half_classes[0]) { - int n = r - base; - ctx->start_search_reg = (n + 1) % ctx->max_target; - } else if (classp == ctx->set->classes[0]) { - int n = (r - base) * 2; - ctx->start_search_reg = (n + 1) % ctx->max_target; + if (ret_reg == (physreg_t) ~0) + ret_reg = physreg; + + ret_reg = ALIGN(ret_reg, align); + if (ret_reg + size > file_size) { + d("ran out of room for the new interval!\n"); + unreachable("reg pressure calculation was wrong!"); + return 0; } - return r; + return ret_reg; } static void -ra_init(struct ir3_ra_ctx *ctx) +update_affinity(struct ir3_register *reg, physreg_t physreg) +{ + if (!reg->merge_set || reg->merge_set->preferred_reg != (physreg_t) ~0) + return; + + if (physreg < reg->merge_set_offset) + return; + + reg->merge_set->preferred_reg = physreg - reg->merge_set_offset; +} + +/* Try to find free space for a register without shuffling anything. This uses + * a round-robin algorithm to reduce false dependencies. + */ +static physreg_t +find_best_gap(struct ra_file *file, unsigned file_size, + unsigned size, unsigned align, bool is_source) { - unsigned n, base; + BITSET_WORD *available = is_source ? 
file->available_to_evict : file->available; + + unsigned start = ALIGN(file->start, align) % (file_size - size + align); + unsigned candidate = start; + do { + bool is_available = true; + for (unsigned i = 0; i < size; i++) { + if (!BITSET_TEST(available, candidate + i)) { + is_available = false; + break; + } + } - ir3_clear_mark(ctx->ir); - n = ir3_count_instructions_ra(ctx->ir); + if (is_available) { + file->start = (candidate + size) % file_size; + return candidate; + } - ctx->instrd = rzalloc_array(NULL, struct ir3_ra_instr_data, n); + candidate += align; + if (candidate + size > file_size) + candidate = 0; + } while (candidate != start); + + return (physreg_t) ~0; +} - foreach_block (block, &ctx->ir->block_list) { - ra_block_find_definers(ctx, block); - } +static struct ra_file * +ra_get_file(struct ra_ctx *ctx, struct ir3_register *reg) +{ + if (reg->flags & IR3_REG_SHARED) + return &ctx->shared; + else if (ctx->merged_regs || !(reg->flags & IR3_REG_HALF)) + return &ctx->full; + else + return &ctx->half; +} - foreach_block (block, &ctx->ir->block_list) { - ra_block_name_instructions(ctx, block); +/* This is the main entrypoint for picking a register. Pick a free register + * for "reg", shuffling around sources if necessary. In the normal case where + * "is_source" is false, this register can overlap with killed sources + * (intervals with "is_killed == true"). If "is_source" is true, then + * is_killed is ignored and the register returned must not overlap with killed + * sources. This must be used for tied registers, because we're actually + * allocating the destination and the tied source at the same time. + */ + +static physreg_t +get_reg(struct ra_ctx *ctx, struct ra_file *file, struct ir3_register *reg, + bool is_source) +{ + unsigned file_size = reg_file_size(file, reg); + if (reg->merge_set && reg->merge_set->preferred_reg != (physreg_t) ~0) { + physreg_t preferred_reg = + reg->merge_set->preferred_reg + reg->merge_set_offset; + if (preferred_reg < file_size && + preferred_reg % reg_elem_size(reg) == 0 && + get_reg_specified(file, reg, preferred_reg, is_source)) + return preferred_reg; } - /* figure out the base register name for each class. The - * actual ra name is class_base[cls] + instr->name; + /* If this register is a subset of a merge set which we have not picked a + * register for, first try to allocate enough space for the entire merge + * set. */ - ctx->class_base[0] = 0; - for (unsigned i = 1; i <= total_class_count; i++) { - ctx->class_base[i] = ctx->class_base[i-1] + - ctx->class_alloc_count[i-1]; + unsigned size = reg_size(reg); + if (reg->merge_set && reg->merge_set->preferred_reg == (physreg_t)~0 && + size < reg->merge_set->size) { + physreg_t best_reg = + find_best_gap(file, file_size, reg->merge_set->size, reg->merge_set->alignment, is_source); + if (best_reg != (physreg_t) ~0u) { + best_reg += reg->merge_set_offset; + return best_reg; + } } - /* and vreg names for array elements: */ - base = ctx->class_base[total_class_count]; - foreach_array (arr, &ctx->ir->array_list) { - arr->base = base; - ctx->class_alloc_count[total_class_count] += arr->length; - base += arr->length; + /* For ALU and SFU instructions, if the src reg is avail to pick, use it. 
+ * Because this doesn't introduce unnecessary dependencies, and it + * potentially avoids needing (ss) syncs for write after read hazards for + * SFU instructions: + */ + if (is_sfu(reg->instr) || is_alu(reg->instr)) { + for (unsigned i = 1; i < reg->instr->regs_count; i++) { + struct ir3_register *src = reg->instr->regs[i]; + if (!ra_reg_is_src(src)) + continue; + if (ra_get_file(ctx, src) == file && reg_size(src) >= size) { + struct ra_interval *src_interval = + &ctx->intervals[src->def->name]; + physreg_t src_physreg = ra_interval_get_physreg(src_interval); + if (src_physreg % reg_elem_size(reg) == 0 && + src_physreg + size <= file_size && + get_reg_specified(file, reg, src_physreg, is_source)) + return src_physreg; + } + } } - ctx->alloc_count += ctx->class_alloc_count[total_class_count]; - - /* Add vreg names for r0.xyz */ - ctx->r0_xyz_nodes = ctx->alloc_count; - ctx->alloc_count += 3; - ctx->hr0_xyz_nodes = ctx->alloc_count; - ctx->alloc_count += 3; - /* Add vreg name for prefetch-exclusion range: */ - ctx->prefetch_exclude_node = ctx->alloc_count++; + physreg_t best_reg = + find_best_gap(file, file_size, size, reg_elem_size(reg), is_source); + if (best_reg != (physreg_t) ~0u) { + return best_reg; + } - if (RA_DEBUG) { - d("INSTRUCTION VREG NAMES:"); - foreach_block (block, &ctx->ir->block_list) { - foreach_instr (instr, &block->instr_list) { - if (!ctx->instrd[instr->ip].defn) - continue; - if (!writes_gpr(instr)) - continue; - di(instr, "%04u", scalar_name(ctx, instr, 0)); + /* Ok, we couldn't find anything that fits. Here is where we have to start + * moving things around to make stuff fit. First try solely evicting + * registers in the way. + */ + unsigned best_eviction_count = ~0; + for (physreg_t i = 0; i + size <= file_size; i += reg_elem_size(reg)) { + unsigned eviction_count; + if (try_evict_regs(ctx, file, reg, i, &eviction_count, is_source, true)) { + if (eviction_count < best_eviction_count) { + best_eviction_count = eviction_count; + best_reg = i; } } - d("ARRAY VREG NAMES:"); - foreach_array (arr, &ctx->ir->array_list) { - d("%04u: arr%u", arr->base, arr->id); - } - d("EXTRA VREG NAMES:"); - d("%04u: r0_xyz_nodes", ctx->r0_xyz_nodes); - d("%04u: hr0_xyz_nodes", ctx->hr0_xyz_nodes); - d("%04u: prefetch_exclude_node", ctx->prefetch_exclude_node); + } + + if (best_eviction_count != ~0) { + ASSERTED bool result = + try_evict_regs(ctx, file, reg, best_reg, &best_eviction_count, is_source, false); + assert(result); + return best_reg; } - ctx->g = ra_alloc_interference_graph(ctx->set->regs, ctx->alloc_count); - ralloc_steal(ctx->g, ctx->instrd); - ctx->def = rzalloc_array(ctx->g, unsigned, ctx->alloc_count); - ctx->use = rzalloc_array(ctx->g, unsigned, ctx->alloc_count); - - /* TODO add selector callback for split (pre-a6xx) register file: */ - if (ctx->v->mergedregs) { - ra_set_select_reg_callback(ctx->g, ra_select_reg_merged, ctx); + /* Use the dumb fallback only if try_evict_regs() fails. 
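+	 * To recap, the strategies tried above are, in order: the merge set's
+	 * preferred register, a gap large enough for the entire merge set,
+	 * reusing an ALU/SFU source register, any free gap from find_best_gap(),
+	 * and evicting conflicting intervals; compress_regs_left() below is the
+	 * last resort.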
*/ + return compress_regs_left(ctx, file, reg_size(reg), reg_elem_size(reg), is_source); +} - if (ctx->scalar_pass) { - ctx->name_to_instr = _mesa_hash_table_create(ctx->g, - _mesa_hash_int, _mesa_key_int_equal); - } +static void +assign_reg(struct ir3_instruction *instr, struct ir3_register *reg, unsigned num) +{ + if (reg->flags & IR3_REG_ARRAY) { + reg->array.base = num; + if (reg->flags & IR3_REG_RELATIV) + reg->array.offset += num; + else + reg->num = num + reg->array.offset; + } else { + reg->num = num; } } -/* Map the name back to instruction: */ -static struct ir3_instruction * -name_to_instr(struct ir3_ra_ctx *ctx, unsigned name) +static void +mark_src_killed(struct ra_ctx *ctx, struct ir3_register *src) { - ra_assert(ctx, !name_is_array(ctx, name)); - struct hash_entry *entry = _mesa_hash_table_search(ctx->name_to_instr, &name); - if (entry) - return entry->data; - ra_unreachable(ctx, "invalid instr name"); - return NULL; + struct ra_interval *interval = &ctx->intervals[src->def->name]; + + if (!(src->flags & IR3_REG_FIRST_KILL) || interval->is_killed || + interval->interval.parent || !rb_tree_is_empty(&interval->interval.children)) + return; + + ra_file_mark_killed(ra_get_file(ctx, src), interval); } -static bool -name_is_array(struct ir3_ra_ctx *ctx, unsigned name) +static void +insert_dst(struct ra_ctx *ctx, struct ir3_register *dst) { - return name >= ctx->class_base[total_class_count]; + struct ra_file *file = ra_get_file(ctx, dst); + struct ra_interval *interval = &ctx->intervals[dst->name]; + + d("insert dst %u physreg %u", dst->name, ra_interval_get_physreg(interval)); + + if (!(dst->flags & IR3_REG_UNUSED)) + ra_file_insert(file, interval); + + assign_reg(dst->instr, dst, ra_interval_get_num(interval)); } -static struct ir3_array * -name_to_array(struct ir3_ra_ctx *ctx, unsigned name) +static void +allocate_dst_fixed(struct ra_ctx *ctx, struct ir3_register *dst, physreg_t physreg) { - ra_assert(ctx, name_is_array(ctx, name)); - foreach_array (arr, &ctx->ir->array_list) { - if (name < (arr->base + arr->length)) - return arr; - } - ra_unreachable(ctx, "invalid array name"); - return NULL; + struct ra_interval *interval = &ctx->intervals[dst->name]; + update_affinity(dst, physreg); + + ra_interval_init(interval, dst); + interval->physreg_start = physreg; + interval->physreg_end = physreg + reg_size(dst); } static void -ra_destroy(struct ir3_ra_ctx *ctx) +allocate_dst(struct ra_ctx *ctx, struct ir3_register *dst) { - ralloc_free(ctx->g); + struct ra_file *file = ra_get_file(ctx, dst); + + struct ir3_register *tied = ra_dst_get_tied_src(ctx->compiler, dst); + if (tied) { + struct ra_interval *tied_interval = &ctx->intervals[tied->def->name]; + struct ra_interval *dst_interval = &ctx->intervals[dst->name]; + physreg_t tied_physreg = ra_interval_get_physreg(tied_interval); + if (tied_interval->is_killed) { + /* The easy case: the source is killed, so we can just reuse it + * for the destination. + */ + allocate_dst_fixed(ctx, dst, ra_interval_get_physreg(tied_interval)); + } else { + /* The source is live-through, so we need to get a free register + * (which is free for both the source and destination!), copy the + * original source to it, then use that for the source and + * destination. 
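+			 *
+			 * For example (register numbers purely illustrative): if the
+			 * tied source lives in r1.x and is used again later, we pick a
+			 * fresh r4.x, record a parallel copy r4.x <- r1.x, and give both
+			 * the destination and the tied source r4.x, leaving r1.x intact
+			 * for the later uses.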
+ */ + physreg_t physreg = get_reg(ctx, file, dst, true); + allocate_dst_fixed(ctx, dst, physreg); + array_insert(ctx, ctx->parallel_copies, (struct ra_parallel_copy) { + .interval = dst_interval, + .src = tied_physreg, + }); + } + + return; + } + + /* All the hard work is done by get_reg here. */ + physreg_t physreg = get_reg(ctx, file, dst, false); + + allocate_dst_fixed(ctx, dst, physreg); } static void -__def(struct ir3_ra_ctx *ctx, struct ir3_ra_block_data *bd, unsigned name, - struct ir3_instruction *instr) +assign_src(struct ra_ctx *ctx, struct ir3_instruction *instr, struct ir3_register *src) { - ra_assert(ctx, name < ctx->alloc_count); + struct ra_interval *interval = &ctx->intervals[src->def->name]; + struct ra_file *file = ra_get_file(ctx, src); - /* split/collect do not actually define any real value */ - if ((instr->opc == OPC_META_SPLIT) || (instr->opc == OPC_META_COLLECT)) - return; + bool array_rmw = ra_reg_is_array_rmw(src); - /* defined on first write: */ - if (!ctx->def[name]) - ctx->def[name] = instr->ip; - ctx->use[name] = MAX2(ctx->use[name], instr->ip); - BITSET_SET(bd->def, name); + struct ir3_register *tied = ra_src_get_tied_dst(ctx->compiler, instr, src); + physreg_t physreg; + if (tied) { + struct ra_interval *tied_interval = &ctx->intervals[tied->name]; + physreg = ra_interval_get_physreg(tied_interval); + } else { + physreg = ra_interval_get_physreg(interval); + } + + assign_reg(instr, src, ra_physreg_to_num(physreg, src->flags)); + + if (src->flags & IR3_REG_FIRST_KILL) + ra_file_remove(file, interval); + + /* This source is also a destination. */ + if (array_rmw) { + struct ra_interval *dst_interval = &ctx->intervals[src->name]; + ra_interval_init(dst_interval, src); + dst_interval->physreg_start = physreg; + dst_interval->physreg_end = physreg + src->size * reg_elem_size(src); + ra_file_insert(file, dst_interval); + } } +/* Insert a parallel copy instruction before the instruction with the parallel + * copy entries we've built up. 
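+ *
+ * The meta instruction lists all destinations first and then all
+ * sources, e.g. with two entries (registers chosen for illustration
+ * only):
+ *
+ *    parallel_copy r2.x, r4.y, r0.x, r1.z
+ *
+ * meaning { r2.x <- r0.x, r4.y <- r1.z }, performed as one parallel
+ * move so that overlapping or swapped copies stay correct.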
+ */ static void -__use(struct ir3_ra_ctx *ctx, struct ir3_ra_block_data *bd, unsigned name, - struct ir3_instruction *instr) +insert_parallel_copy_instr(struct ra_ctx *ctx, struct ir3_instruction *instr) { - ra_assert(ctx, name < ctx->alloc_count); - ctx->use[name] = MAX2(ctx->use[name], instr->ip); - if (!BITSET_TEST(bd->def, name)) - BITSET_SET(bd->use, name); + if (ctx->parallel_copies_count == 0) + return; + + struct ir3_instruction *pcopy = + ir3_instr_create(instr->block, OPC_META_PARALLEL_COPY, 2 * ctx->parallel_copies_count); + + for (unsigned i = 0; i < ctx->parallel_copies_count; i++) { + struct ra_parallel_copy *entry = &ctx->parallel_copies[i]; + struct ir3_register *reg = + ir3_reg_create(pcopy, ra_interval_get_num(entry->interval), + entry->interval->interval.reg->flags & ~IR3_REG_SSA); + reg->size = entry->interval->interval.reg->size; + reg->wrmask = entry->interval->interval.reg->wrmask; + } + + for (unsigned i = 0; i < ctx->parallel_copies_count; i++) { + struct ra_parallel_copy *entry = &ctx->parallel_copies[i]; + struct ir3_register *reg = + ir3_reg_create(pcopy, + ra_physreg_to_num(entry->src, entry->interval->interval.reg->flags), + entry->interval->interval.reg->flags & ~(IR3_REG_DEST | IR3_REG_SSA)); + reg->size = entry->interval->interval.reg->size; + reg->wrmask = entry->interval->interval.reg->wrmask; + } + + list_del(&pcopy->node); + list_addtail(&pcopy->node, &instr->node); + ctx->parallel_copies_count = 0; } static void -ra_block_compute_live_ranges(struct ir3_ra_ctx *ctx, struct ir3_block *block) +handle_normal_instr(struct ra_ctx *ctx, struct ir3_instruction *instr) { - struct ir3_ra_block_data *bd; - unsigned bitset_words = BITSET_WORDS(ctx->alloc_count); + /* First, mark sources as going-to-be-killed while allocating the dest. */ + ra_foreach_src(src, instr) { + mark_src_killed(ctx, src); + } -#define def(name, instr) __def(ctx, bd, name, instr) -#define use(name, instr) __use(ctx, bd, name, instr) + /* Allocate the destination. */ + ra_foreach_dst(dst, instr) { + if (ra_reg_is_array_rmw(dst)) + continue; + allocate_dst(ctx, dst); + } - bd = rzalloc(ctx->g, struct ir3_ra_block_data); + /* Now handle sources. Go backward so that in case there are multiple + * sources with the same def and that def is killed we only remove it at + * the end. + */ + ra_foreach_src_rev(src, instr) { + assign_src(ctx, instr, src); + } - bd->def = rzalloc_array(bd, BITSET_WORD, bitset_words); - bd->use = rzalloc_array(bd, BITSET_WORD, bitset_words); - bd->livein = rzalloc_array(bd, BITSET_WORD, bitset_words); - bd->liveout = rzalloc_array(bd, BITSET_WORD, bitset_words); + /* Now finally insert the destination into the map. 
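+ * At this point assign_src() has already removed any killed source
+ * intervals from the file, so a destination that was allocated on top
+ * of a killed source can be inserted without overlapping them.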
*/ + ra_foreach_dst(dst, instr) { + if (ra_reg_is_array_rmw(dst)) + continue; + insert_dst(ctx, dst); + } - block->data = bd; + insert_parallel_copy_instr(ctx, instr); +} - struct ir3_instruction *first_non_input = NULL; - foreach_instr (instr, &block->instr_list) { - if (instr->opc != OPC_META_INPUT) { - first_non_input = instr; - break; - } +static void +handle_split(struct ra_ctx *ctx, struct ir3_instruction *instr) +{ + struct ir3_register *dst = instr->regs[0]; + struct ir3_register *src = instr->regs[1]; + + if (dst->merge_set == NULL || src->def->merge_set != dst->merge_set) { + handle_normal_instr(ctx, instr); + return; } - foreach_instr (instr, &block->instr_list) { - foreach_def (name, ctx, instr) { - if (name_is_array(ctx, name)) { - struct ir3_array *arr = name_to_array(ctx, name); - - arr->start_ip = MIN2(arr->start_ip, instr->ip); - arr->end_ip = MAX2(arr->end_ip, instr->ip); - - for (unsigned i = 0; i < arr->length; i++) { - unsigned name = arr->base + i; - if(arr->half) - ra_set_node_class(ctx->g, name, ctx->set->half_classes[0]); - else - ra_set_node_class(ctx->g, name, ctx->set->classes[0]); - } - } else { - struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip]; - if (is_shared(instr)) { - ra_set_node_class(ctx->g, name, - ctx->set->shared_classes[id->cls - SHARED_OFFSET]); - } else if (is_half(instr)) { - ra_set_node_class(ctx->g, name, - ctx->set->half_classes[id->cls - HALF_OFFSET]); - } else { - ra_set_node_class(ctx->g, name, - ctx->set->classes[id->cls]); - } - } + struct ra_interval *src_interval = &ctx->intervals[src->def->name]; - def(name, instr); + physreg_t physreg = ra_interval_get_physreg(src_interval); + assign_src(ctx, instr, src); - if ((instr->opc == OPC_META_INPUT) && first_non_input) - use(name, first_non_input); + allocate_dst_fixed(ctx, dst, physreg - src->def->merge_set_offset + dst->merge_set_offset); + insert_dst(ctx, dst); +} - /* Texture instructions with writemasks can be treated as smaller - * vectors (or just scalars!) to allocate knowing that the - * masked-out regs won't be written, but we need to make sure that - * the start of the vector doesn't come before the first register - * or we'll wrap. - */ - if (is_tex_or_prefetch(instr)) { - int writemask_skipped_regs = ffs(instr->regs[0]->wrmask) - 1; - int r0_xyz = is_half(instr) ? - ctx->hr0_xyz_nodes : ctx->r0_xyz_nodes; - for (int i = 0; i < writemask_skipped_regs; i++) - ra_add_node_interference(ctx->g, name, r0_xyz + i); - } +static void +handle_collect(struct ra_ctx *ctx, struct ir3_instruction *instr) +{ + struct ir3_merge_set *dst_set = instr->regs[0]->merge_set; + unsigned dst_offset = instr->regs[0]->merge_set_offset; + + if (!dst_set || dst_set->regs_count == 1) { + handle_normal_instr(ctx, instr); + return; + } - /* Pre-fetched textures have a lower limit for bits to encode dst - * register, so add additional interference with registers above - * that limit. - */ - if (instr->opc == OPC_META_TEX_PREFETCH) { - ra_add_node_interference(ctx->g, name, - ctx->prefetch_exclude_node); - } - } + /* We need to check if any of the sources are contained in an interval + * that is at least as large as the vector. In this case, we should put + * the vector inside that larger interval. (There should be one + * unambiguous place to put it, because values sharing the same merge set + * should be allocated together.) This can happen in a case like: + * + * ssa_1 (wrmask=0xf) = ... 
+ * ssa_2 = split ssa_1 off:0 + * ssa_3 = split ssa_1 off:1 + * ssa_4 (wrmask=0x3) = collect (kill)ssa_2, (kill)ssa_3 + * ... = (kill)ssa_1 + * ... = (kill)ssa_4 + * + * ssa_4 will be coalesced with ssa_1 and needs to be allocated inside it. + */ + physreg_t dst_fixed = (physreg_t) ~0u; - foreach_use (name, ctx, instr) { - if (name_is_array(ctx, name)) { - struct ir3_array *arr = name_to_array(ctx, name); + for (unsigned i = 1; i < instr->regs_count; i++) { + if (!ra_reg_is_src(instr->regs[i])) + continue; - arr->start_ip = MIN2(arr->start_ip, instr->ip); - arr->end_ip = MAX2(arr->end_ip, instr->ip); + if (instr->regs[i]->flags & IR3_REG_FIRST_KILL) { + mark_src_killed(ctx, instr->regs[i]); + } - /* NOTE: arrays are not SSA so unconditionally - * set use bit: - */ - BITSET_SET(bd->use, name); - } + struct ir3_register *src = instr->regs[i]; + struct ra_interval *interval = &ctx->intervals[src->def->name]; - use(name, instr); + if (src->def->merge_set != dst_set || interval->is_killed) + continue; + while (interval->interval.parent != NULL) { + interval = ir3_reg_interval_to_ra_interval(interval->interval.parent); } - - foreach_name (name, ctx, instr) { - /* split/collect instructions have duplicate names - * as real instructions, so they skip the hashtable: + if (reg_size(interval->interval.reg) >= reg_size(instr->regs[0])) { + dst_fixed = interval->physreg_start - interval->interval.reg->merge_set_offset + dst_offset; + } else { + /* For sources whose root interval is smaller than the + * destination (i.e. the normal case), we will shuffle them + * around after allocating the destination. Mark them killed so + * that the destination can be allocated over them, even if they + * aren't actually killed. */ - if (ctx->name_to_instr && !((instr->opc == OPC_META_SPLIT) || - (instr->opc == OPC_META_COLLECT))) { - /* this is slightly annoying, we can't just use an - * integer on the stack - */ - unsigned *key = ralloc(ctx->name_to_instr, unsigned); - *key = name; - ra_assert(ctx, !_mesa_hash_table_search(ctx->name_to_instr, key)); - _mesa_hash_table_insert(ctx->name_to_instr, key, instr); - } + ra_file_mark_killed(ra_get_file(ctx, src), interval); } } -} - -static bool -ra_compute_livein_liveout(struct ir3_ra_ctx *ctx) -{ - unsigned bitset_words = BITSET_WORDS(ctx->alloc_count); - bool progress = false; - foreach_block (block, &ctx->ir->block_list) { - struct ir3_ra_block_data *bd = block->data; + if (dst_fixed != (physreg_t) ~0u) + allocate_dst_fixed(ctx, instr->regs[0], dst_fixed); + else + allocate_dst(ctx, instr->regs[0]); - /* update livein: */ - for (unsigned i = 0; i < bitset_words; i++) { - /* anything used but not def'd within a block is - * by definition a live value coming into the block: - */ - BITSET_WORD new_livein = - (bd->use[i] | (bd->liveout[i] & ~bd->def[i])); + /* Remove the temporary is_killed we added */ + for (unsigned i = 1; i < instr->regs_count; i++) { + if (!ra_reg_is_src(instr->regs[i])) + continue; - if (new_livein & ~bd->livein[i]) { - bd->livein[i] |= new_livein; - progress = true; - } + struct ir3_register *src = instr->regs[i]; + struct ra_interval *interval = &ctx->intervals[src->def->name]; + while (interval->interval.parent != NULL) { + interval = ir3_reg_interval_to_ra_interval(interval->interval.parent); } - /* update liveout: */ - for (unsigned j = 0; j < ARRAY_SIZE(block->successors); j++) { - struct ir3_block *succ = block->successors[j]; - struct ir3_ra_block_data *succ_bd; + /* Filter out cases where it actually should be killed */ + if (interval != 
&ctx->intervals[src->def->name] || + !(src->flags & IR3_REG_KILL)) + interval->is_killed = false; + } - if (!succ) - continue; - succ_bd = succ->data; + ra_foreach_src_rev(src, instr) { + assign_src(ctx, instr, src); + } - for (unsigned i = 0; i < bitset_words; i++) { - /* add anything that is livein in a successor block - * to our liveout: - */ - BITSET_WORD new_liveout = - (succ_bd->livein[i] & ~bd->liveout[i]); + /* Note: insert_dst will automatically shuffle around any intervals that + * are a child of the collect by making them children of the collect. + */ - if (new_liveout) { - bd->liveout[i] |= new_liveout; - progress = true; - } - } - } - } + insert_dst(ctx, instr->regs[0]); - return progress; + insert_parallel_copy_instr(ctx, instr); } +/* Parallel copies before RA should only be at the end of the block, for + * phi's. For these we only need to fill in the sources, and then we fill in + * the destinations in the successor block. + */ static void -print_bitset(const char *name, BITSET_WORD *bs, unsigned cnt) -{ - bool first = true; - debug_printf("RA: %s:", name); - for (unsigned i = 0; i < cnt; i++) { - if (BITSET_TEST(bs, i)) { - if (!first) - debug_printf(","); - debug_printf(" %04u", i); - first = false; - } - } - debug_printf("\n"); -} - -/* size of one component of instruction result, ie. half vs full: */ -static unsigned -live_size(struct ir3_instruction *instr) +handle_pcopy(struct ra_ctx *ctx, struct ir3_instruction *instr) { - if (is_half(instr)) { - return 1; - } else if (is_shared(instr)) { - /* doesn't count towards footprint */ - return 0; - } else { - return 2; + ra_foreach_src_rev(src, instr) { + assign_src(ctx, instr, src); } } -static unsigned -name_size(struct ir3_ra_ctx *ctx, unsigned name) +/* Some inputs may need to be precolored. We need to handle those first, so + * that other non-precolored inputs don't accidentally get allocated over + * them. Inputs are the very first thing in the shader, so it shouldn't be a + * problem to allocate them to a specific physreg. + */ + +static void +handle_precolored_input(struct ra_ctx *ctx, struct ir3_instruction *instr) { - if (name_is_array(ctx, name)) { - struct ir3_array *arr = name_to_array(ctx, name); - return arr->half ? 1 : 2; - } else { - struct ir3_instruction *instr = name_to_instr(ctx, name); - /* in scalar pass, each name represents on scalar value, - * half or full precision - */ - return live_size(instr); - } + if (instr->regs[0]->num == INVALID_REG) + return; + + struct ra_interval *interval = &ctx->intervals[instr->regs[0]->name]; + physreg_t physreg = ra_reg_get_physreg(instr->regs[0]); + allocate_dst_fixed(ctx, instr->regs[0], physreg); + insert_dst(ctx, instr->regs[0]); + interval->frozen = true; } -static unsigned -ra_calc_block_live_values(struct ir3_ra_ctx *ctx, struct ir3_block *block) +static void +handle_input(struct ra_ctx *ctx, struct ir3_instruction *instr) { - struct ir3_ra_block_data *bd = block->data; - unsigned name; + if (instr->regs[0]->num != INVALID_REG) + return; - ra_assert(ctx, ctx->name_to_instr); + allocate_dst(ctx, instr->regs[0]); - /* TODO this gets a bit more complicated in non-scalar pass.. but - * possibly a lowball estimate is fine to start with if we do - * round-robin in non-scalar pass? Maybe we just want to handle - * that in a different fxn? 
- */ - ra_assert(ctx, ctx->scalar_pass); + struct ra_file *file = ra_get_file(ctx, instr->regs[0]); + struct ra_interval *interval = &ctx->intervals[instr->regs[0]->name]; + ra_file_insert(file, interval); +} - BITSET_WORD *live = - rzalloc_array(bd, BITSET_WORD, BITSET_WORDS(ctx->alloc_count)); +static void +assign_input(struct ra_ctx *ctx, struct ir3_instruction *instr) +{ + struct ra_interval *interval = &ctx->intervals[instr->regs[0]->name]; + struct ra_file *file = ra_get_file(ctx, instr->regs[0]); - /* Add the live input values: */ - unsigned livein = 0; - BITSET_FOREACH_SET (name, bd->livein, ctx->alloc_count) { - livein += name_size(ctx, name); - BITSET_SET(live, name); + if (instr->regs[0]->num == INVALID_REG) { + assign_reg(instr, instr->regs[0], ra_interval_get_num(interval)); + } else { + interval->frozen = false; } - d("---------------------"); - d("block%u: LIVEIN: %u", block_id(block), livein); - - unsigned max = livein; - int cur_live = max; - - /* Now that we know the live inputs to the block, iterate the - * instructions adjusting the current # of live values as we - * see their last use: - */ - foreach_instr (instr, &block->instr_list) { - if (RA_DEBUG) - print_bitset("LIVE", live, ctx->alloc_count); - di(instr, "CALC"); - - unsigned new_live = 0; /* newly live values */ - unsigned new_dead = 0; /* newly no-longer live values */ - unsigned next_dead = 0; /* newly dead following this instr */ - - foreach_def (name, ctx, instr) { - /* NOTE: checking ctx->def filters out things like split/ - * collect which are just redefining existing live names - * or array writes to already live array elements: - */ - if (ctx->def[name] != instr->ip) - continue; - new_live += live_size(instr); - d("NEW_LIVE: %u (new_live=%u, use=%u)", name, new_live, ctx->use[name]); - BITSET_SET(live, name); - /* There can be cases where this is *also* the last use - * of a value, for example instructions that write multiple - * values, only some of which are used. These values are - * dead *after* (rather than during) this instruction. - */ - if (ctx->use[name] != instr->ip) - continue; - next_dead += live_size(instr); - d("NEXT_DEAD: %u (next_dead=%u)", name, next_dead); - BITSET_CLEAR(live, name); - } - - /* To be more resilient against special cases where liverange - * is extended (like first_non_input), rather than using the - * foreach_use() iterator, we iterate the current live values - * instead: - */ - BITSET_FOREACH_SET (name, live, ctx->alloc_count) { - /* Is this the last use? */ - if (ctx->use[name] != instr->ip) - continue; - new_dead += name_size(ctx, name); - d("NEW_DEAD: %u (new_dead=%u)", name, new_dead); - BITSET_CLEAR(live, name); - } + if (instr->regs[0]->flags & IR3_REG_UNUSED) + ra_file_remove(file, interval); - cur_live += new_live; - cur_live -= new_dead; + ra_foreach_src_rev(src, instr) + assign_src(ctx, instr, src); +} - ra_assert(ctx, cur_live >= 0); - d("CUR_LIVE: %u", cur_live); +/* chmask is a bit weird, because it has pre-colored sources due to the need + * to pass some registers to the next stage. Fortunately there are only at + * most two, and there should be no other live values by the time we get to + * this instruction, so we only have to do the minimum and don't need any + * fancy fallbacks. + * + * TODO: Add more complete handling of precolored sources, e.g. for function + * argument handling. 
We'd need a way to mark sources as fixed so that they + * don't get moved around when placing other sources in the fallback case, and + * a duplication of much of the logic in get_reg(). This also opens another + * can of worms, e.g. what if the precolored source is a split of a vector + * which is still live -- this breaks our assumption that splits don't incur + * any "extra" register requirements and we'd have to break it out of the + * parent ra_interval. + */ - max = MAX2(max, cur_live); +static void +handle_precolored_source(struct ra_ctx *ctx, struct ir3_register *src) +{ + struct ra_file *file = ra_get_file(ctx, src); + struct ra_interval *interval = &ctx->intervals[src->def->name]; + physreg_t physreg = ra_reg_get_physreg(src); - /* account for written values which are not used later, - * but after updating max (since they are for one cycle - * live) - */ - cur_live -= next_dead; - ra_assert(ctx, cur_live >= 0); + if (ra_interval_get_num(interval) == src->num) + return; - if (RA_DEBUG) { - unsigned cnt = 0; - BITSET_FOREACH_SET (name, live, ctx->alloc_count) { - cnt += name_size(ctx, name); - } - ra_assert(ctx, cur_live == cnt); + /* Try evicting stuff in our way if it isn't free. This won't move + * anything unless it overlaps with our precolored physreg, so we don't + * have to worry about evicting other precolored sources. + */ + if (!get_reg_specified(file, src, physreg, true)) { + unsigned eviction_count; + if (!try_evict_regs(ctx, file, src, physreg, &eviction_count, true, false)) { + unreachable("failed to evict for precolored source!"); + return; } } - d("block%u max=%u", block_id(block), max); + ra_move_interval(ctx, file, interval, physreg); +} - /* the remaining live should match liveout (for extra sanity testing): */ - if (RA_DEBUG) { - unsigned new_dead = 0; - BITSET_FOREACH_SET (name, live, ctx->alloc_count) { - /* Is this the last use? */ - if (ctx->use[name] != block->end_ip) - continue; - new_dead += name_size(ctx, name); - d("NEW_DEAD: %u (new_dead=%u)", name, new_dead); - BITSET_CLEAR(live, name); - } - unsigned liveout = 0; - BITSET_FOREACH_SET (name, bd->liveout, ctx->alloc_count) { - liveout += name_size(ctx, name); - BITSET_CLEAR(live, name); - } +static void +handle_chmask(struct ra_ctx *ctx, struct ir3_instruction *instr) +{ + /* Note: we purposely don't mark sources as killed, so that we can reuse + * some of the get_reg() machinery as-if the source is a destination. + * Marking it as killed would make e.g. get_reg_specified() wouldn't work + * correctly. + */ + ra_foreach_src(src, instr) { + assert(src->num != INVALID_REG); + handle_precolored_source(ctx, src); + } - if (cur_live != liveout) { - print_bitset("LEAKED", live, ctx->alloc_count); - /* TODO there are a few edge cases where live-range extension - * tells us a value is livein. But not used by the block or - * liveout for the block. Possibly a bug in the liverange - * extension. 
But for now leave the assert disabled: - ra_assert(ctx, cur_live == liveout); - */ - } + ra_foreach_src(src, instr) { + struct ra_file *file = ra_get_file(ctx, src); + struct ra_interval *interval = &ctx->intervals[src->def->name]; + if (src->flags & IR3_REG_FIRST_KILL) + ra_file_remove(file, interval); } - ralloc_free(live); + /* add dummy destination for validation */ + assign_reg(instr, instr->regs[0], 0); - return max; + insert_parallel_copy_instr(ctx, instr); } -static unsigned -ra_calc_max_live_values(struct ir3_ra_ctx *ctx) +static physreg_t +read_register(struct ra_ctx *ctx, struct ir3_block *block, struct ir3_register *def) { - unsigned max = 0; - - foreach_block (block, &ctx->ir->block_list) { - unsigned block_live = ra_calc_block_live_values(ctx, block); - max = MAX2(max, block_live); + struct ra_block_state *state = &ctx->blocks[block->index]; + if (state->renames) { + struct hash_entry *entry = _mesa_hash_table_search(state->renames, def); + if (entry) { + return (physreg_t)(uintptr_t)entry->data; + } } - return max; + return ra_reg_get_physreg(def); } static void -ra_add_interference(struct ir3_ra_ctx *ctx) +handle_live_in(struct ra_ctx *ctx, struct ir3_register *def) { - struct ir3 *ir = ctx->ir; + physreg_t physreg = ~0; + for (unsigned i = 0; i < ctx->block->predecessors_count; i++) { + struct ir3_block *pred = ctx->block->predecessors[i]; + struct ra_block_state *pred_state = &ctx->blocks[pred->index]; - /* initialize array live ranges: */ - foreach_array (arr, &ir->array_list) { - arr->start_ip = ~0; - arr->end_ip = 0; - } + if (!pred_state->visited) + continue; - /* set up the r0.xyz precolor regs. */ - for (int i = 0; i < 3; i++) { - ra_set_node_reg(ctx->g, ctx->r0_xyz_nodes + i, i); - ra_set_node_reg(ctx->g, ctx->hr0_xyz_nodes + i, - ctx->set->first_half_reg + i); + physreg = read_register(ctx, pred, def); + break; } - /* pre-color node that conflict with half/full regs higher than what - * can be encoded for tex-prefetch: - */ - ra_set_node_reg(ctx->g, ctx->prefetch_exclude_node, - ctx->set->prefetch_exclude_reg); + assert(physreg != (physreg_t)~0); + + struct ra_interval *interval = &ctx->intervals[def->name]; + struct ra_file *file = ra_get_file(ctx, def); + ra_interval_init(interval, def); + interval->physreg_start = physreg; + interval->physreg_end = physreg + reg_size(def); + ra_file_insert(file, interval); +} - /* compute live ranges (use/def) on a block level, also updating - * block's def/use bitmasks (used below to calculate per-block - * livein/liveout): +static void +handle_live_out(struct ra_ctx *ctx, struct ir3_register *def) +{ + /* Skip parallelcopy's which in the original program are only used as phi + * arguments. Even though phi arguments are live out, they are only + * assigned when the phi is. 
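+ *
+ * For everything else, if the value ended up in a different physreg
+ * than the one recorded in its def (because it was moved or evicted),
+ * we remember that as a "rename" so that successor blocks can look up
+ * the value's actual location via read_register().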
*/ - foreach_block (block, &ir->block_list) { - ra_block_compute_live_ranges(ctx, block); + if (def->instr->opc == OPC_META_PARALLEL_COPY) + return; + + struct ra_block_state *state = &ctx->blocks[ctx->block->index]; + struct ra_interval *interval = &ctx->intervals[def->name]; + physreg_t physreg = ra_interval_get_physreg(interval); + if (physreg != ra_reg_get_physreg(def)) { + if (!state->renames) + state->renames = _mesa_pointer_hash_table_create(ctx); + _mesa_hash_table_insert(state->renames, def, (void *)(uintptr_t)physreg); } +} - /* update per-block livein/liveout: */ - while (ra_compute_livein_liveout(ctx)) {} +static void +handle_phi(struct ra_ctx *ctx, struct ir3_register *def) +{ + struct ra_file *file = ra_get_file(ctx, def); + struct ra_interval *interval = &ctx->intervals[def->name]; - if (RA_DEBUG) { - d("AFTER LIVEIN/OUT:"); - foreach_block (block, &ir->block_list) { - struct ir3_ra_block_data *bd = block->data; - d("block%u:", block_id(block)); - print_bitset(" def", bd->def, ctx->alloc_count); - print_bitset(" use", bd->use, ctx->alloc_count); - print_bitset(" l/i", bd->livein, ctx->alloc_count); - print_bitset(" l/o", bd->liveout, ctx->alloc_count); - } - foreach_array (arr, &ir->array_list) { - d("array%u:", arr->id); - d(" length: %u", arr->length); - d(" start_ip: %u", arr->start_ip); - d(" end_ip: %u", arr->end_ip); - } + /* phis are always scalar, so they should already be the smallest possible + * size. However they may be coalesced with other live-in values/phi + * nodes, so check for that here. + */ + struct ir3_reg_interval *parent_ir3 = + ir3_reg_interval_search(&file->reg_ctx.intervals, def->interval_start); + physreg_t physreg; + if (parent_ir3) { + struct ra_interval *parent = ir3_reg_interval_to_ra_interval(parent_ir3); + physreg = ra_interval_get_physreg(parent) + + (def->interval_start - parent_ir3->reg->interval_start); + } else { + physreg = get_reg(ctx, file, def, false); } - /* extend start/end ranges based on livein/liveout info from cfg: */ - foreach_block (block, &ir->block_list) { - struct ir3_ra_block_data *bd = block->data; - - for (unsigned i = 0; i < ctx->alloc_count; i++) { - if (BITSET_TEST(bd->livein, i)) { - ctx->def[i] = MIN2(ctx->def[i], block->start_ip); - ctx->use[i] = MAX2(ctx->use[i], block->start_ip); - } + allocate_dst_fixed(ctx, def, physreg); - if (BITSET_TEST(bd->liveout, i)) { - ctx->def[i] = MIN2(ctx->def[i], block->end_ip); - ctx->use[i] = MAX2(ctx->use[i], block->end_ip); - } - } + ra_file_insert(file, interval); +} - foreach_array (arr, &ctx->ir->array_list) { - for (unsigned i = 0; i < arr->length; i++) { - if (BITSET_TEST(bd->livein, i + arr->base)) { - arr->start_ip = MIN2(arr->start_ip, block->start_ip); - } - if (BITSET_TEST(bd->liveout, i + arr->base)) { - arr->end_ip = MAX2(arr->end_ip, block->end_ip); - } - } +static void +assign_phi(struct ra_ctx *ctx, struct ir3_instruction *phi) +{ + struct ra_file *file = ra_get_file(ctx, phi->regs[0]); + struct ra_interval *interval = &ctx->intervals[phi->regs[0]->name]; + assert(!interval->interval.parent); + unsigned num = ra_interval_get_num(interval); + assign_reg(phi, phi->regs[0], num); + + /* Assign the parallelcopy sources of this phi */ + for (unsigned i = 1; i < phi->regs_count; i++) { + if (phi->regs[i]->def) { + assign_reg(phi, phi->regs[i], num); + assign_reg(phi, phi->regs[i]->def, num); } } - if (ctx->name_to_instr) { - unsigned max = ra_calc_max_live_values(ctx); - ra_set_register_target(ctx, max); - } - - for (unsigned i = 0; i < ctx->alloc_count; i++) { - for 
(unsigned j = 0; j < ctx->alloc_count; j++) { - if (intersects(ctx->def[i], ctx->use[i], - ctx->def[j], ctx->use[j])) { - ra_add_node_interference(ctx->g, i, j); - } - } - } + if (phi->regs[0]->flags & IR3_REG_UNUSED) + ra_file_remove(file, interval); } -/* NOTE: instr could be NULL for IR3_REG_ARRAY case, for the first - * array access(es) which do not have any previous access to depend - * on from scheduling point of view +/* When we split a live range, we sometimes need to emit fixup code at the end + * of a block. For example, something like: + * + * a = ... + * if (...) { + * ... + * a' = a + * b = ... // a evicted to make room for b + * ... + * } + * ... = a + * + * When we insert the copy to a' in insert_parallel_copy_instr(), this forces + * to insert another copy "a = a'" at the end of the if. Normally this would + * also entail adding a phi node, but since we're about to go out of SSA + * anyway we just insert an extra move. Note, however, that "b" might be used + * in a phi node at the end of the if and share registers with "a", so we + * have to be careful to extend any preexisting parallelcopy instruction + * instead of creating our own in order to guarantee that they properly get + * swapped. */ + static void -reg_assign(struct ir3_ra_ctx *ctx, struct ir3_register *reg, - struct ir3_instruction *instr) +insert_liveout_copy(struct ir3_block *block, physreg_t dst, physreg_t src, + struct ir3_register *reg) { - struct ir3_ra_instr_data *id; - - if (reg->flags & IR3_REG_ARRAY) { - struct ir3_array *arr = - ir3_lookup_array(ctx->ir, reg->array.id); - unsigned name = arr->base + reg->array.offset; - unsigned r = ra_get_node_reg(ctx->g, name); - unsigned num = ctx->set->ra_reg_to_gpr[r]; - - if (reg->flags & IR3_REG_RELATIV) { - reg->array.base = arr->reg; - reg->array.offset = num; - } else { - reg->num = num; - reg->flags &= ~IR3_REG_SSA; - } + struct ir3_instruction *old_pcopy = NULL; + if (!list_is_empty(&block->instr_list)) { + struct ir3_instruction *last = + LIST_ENTRY(struct ir3_instruction, block->instr_list.prev, node); + if (last->opc == OPC_META_PARALLEL_COPY) + old_pcopy = last; + } - reg->flags &= ~IR3_REG_ARRAY; - } else if ((id = &ctx->instrd[instr->ip]) && id->defn) { - unsigned first_component = 0; + unsigned old_pcopy_regs = old_pcopy ? old_pcopy->regs_count : 0; + struct ir3_instruction *pcopy = + ir3_instr_create(block, OPC_META_PARALLEL_COPY, + 2 + old_pcopy_regs); - /* Special case for tex instructions, which may use the wrmask - * to mask off the first component(s). In the scalar pass, - * this means the masked off component(s) are not def'd/use'd, - * so we get a bogus value when we ask the register_allocate - * algo to get the assigned reg for the unused/untouched - * component. 
So we need to consider the first used component: - */ - if (ctx->scalar_pass && is_tex_or_prefetch(id->defn)) { - unsigned n = ffs(id->defn->regs[0]->wrmask); - ra_assert(ctx, n > 0); - first_component = n - 1; - } + for (unsigned i = 0; i < old_pcopy_regs / 2; i++) { + pcopy->regs[pcopy->regs_count++] = old_pcopy->regs[i]; + } - unsigned name = scalar_name(ctx, id->defn, first_component); - unsigned r = ra_get_node_reg(ctx->g, name); - unsigned num = ctx->set->ra_reg_to_gpr[r] + id->off; + struct ir3_register *dst_reg = + ir3_reg_create(pcopy, ra_physreg_to_num(dst, reg->flags), reg->flags); + dst_reg->wrmask = reg->wrmask; + dst_reg->size = reg->size; - ra_assert(ctx, !(reg->flags & IR3_REG_RELATIV)); + for (unsigned i = old_pcopy_regs / 2; i < old_pcopy_regs; i++) { + pcopy->regs[pcopy->regs_count++] = old_pcopy->regs[i]; + } - ra_assert(ctx, num >= first_component); + struct ir3_register *src_reg = + ir3_reg_create(pcopy, ra_physreg_to_num(src, reg->flags), + reg->flags & ~IR3_REG_DEST); + src_reg->wrmask = reg->wrmask; + src_reg->size = reg->size; - if (is_shared(id->defn)) - num += FIRST_SHARED_REG; + if (old_pcopy) + list_del(&old_pcopy->node); +} - reg->num = num - first_component; +static void +insert_live_in_move(struct ra_ctx *ctx, struct ra_interval *interval) +{ + physreg_t physreg = ra_interval_get_physreg(interval); + + for (unsigned i = 0; i < ctx->block->predecessors_count; i++) { + struct ir3_block *pred = ctx->block->predecessors[i]; + struct ra_block_state *pred_state = &ctx->blocks[pred->index]; - reg->flags &= ~IR3_REG_SSA; + if (!pred_state->visited) + continue; - if (is_half(id->defn)) - reg->flags |= IR3_REG_HALF; + physreg_t pred_reg = read_register(ctx, pred, interval->interval.reg); + if (pred_reg != physreg) { + insert_liveout_copy(pred, physreg, pred_reg, interval->interval.reg); + } } } -/* helper to determine which regs to assign in which pass: */ -static bool -should_assign(struct ir3_ra_ctx *ctx, struct ir3_instruction *instr) +static void +insert_file_live_in_moves(struct ra_ctx *ctx, struct ra_file *file) { - /* Array regs are precolored completely separately, and we need to keep - * their array-ness until the end to be able to compute the array reg's - * live interval in the scalar pass. - */ - if (instr->regs[0]->flags & IR3_REG_ARRAY) - return ctx->scalar_pass; - - if ((instr->opc == OPC_META_SPLIT) && - (util_bitcount(instr->regs[1]->wrmask) > 1)) - return !ctx->scalar_pass; - if ((instr->opc == OPC_META_COLLECT) && - (util_bitcount(instr->regs[0]->wrmask) > 1)) - return !ctx->scalar_pass; - return ctx->scalar_pass; + BITSET_WORD *live_in = ctx->live->live_in[ctx->block->index]; + rb_tree_foreach(struct ra_interval, interval, &file->physreg_intervals, physreg_node) { + /* Skip phi nodes. This needs to happen after phi nodes are allocated, + * because we may have to move live-ins around to make space for phi + * nodes, but we shouldn't be handling phi nodes here. + */ + if (BITSET_TEST(live_in, interval->interval.reg->name)) + insert_live_in_move(ctx, interval); + } } static void -ra_block_alloc(struct ir3_ra_ctx *ctx, struct ir3_block *block) +insert_entry_regs(struct ra_block_state *state, struct ra_file *file) { - foreach_instr (instr, &block->instr_list) { - - if (writes_gpr(instr)) { - if (should_assign(ctx, instr)) { - reg_assign(ctx, instr->regs[0], instr); - } - } - - foreach_src_n (reg, n, instr) { - struct ir3_instruction *src = reg->def ? 
reg->def->instr : NULL; - - if (src && should_assign(ctx, instr)) - reg_assign(ctx, src->regs[0], src); - - /* Note: reg->def could be null for IR3_REG_ARRAY */ - if (((reg->flags & IR3_REG_ARRAY) && ctx->scalar_pass) || - (src && should_assign(ctx, src))) { - reg_assign(ctx, instr->regs[n+1], src); - } - } - } - - /* We need to pre-color outputs for the scalar pass in - * ra_precolor_assigned(), so we need to actually assign - * them in the first pass: - */ - if (!ctx->scalar_pass) { - foreach_input (in, ctx->ir) { - reg_assign(ctx, in->regs[0], in); - } + rb_tree_foreach(struct ra_interval, interval, &file->physreg_intervals, physreg_node) { + _mesa_hash_table_insert(state->entry_regs, interval->interval.reg, + (void *)(uintptr_t)interval->physreg_start); } } static void -assign_arr_base(struct ir3_ra_ctx *ctx, struct ir3_array *arr, - struct ir3_instruction **precolor, unsigned nprecolor) +insert_live_in_moves(struct ra_ctx *ctx) { - /* In the mergedregs case, we convert full precision arrays - * to their effective half-precision base, and find conflicts - * amongst all other arrays/inputs. - * - * In the splitregs case (halfreg file and fullreg file do - * not conflict), we ignore arrays and other pre-colors that - * are not the same precision. - */ - bool mergedregs = ctx->v->mergedregs; - unsigned base = 0; + insert_file_live_in_moves(ctx, &ctx->full); + insert_file_live_in_moves(ctx, &ctx->half); + insert_file_live_in_moves(ctx, &ctx->shared); - /* figure out what else we conflict with which has already - * been assigned: + /* If not all predecessors are visited, insert live-in regs so that + * insert_live_out_moves() will work. */ -retry: - foreach_array (arr2, &ctx->ir->array_list) { - if (arr2 == arr) + bool all_preds_visited = true; + for (unsigned i = 0; i < ctx->block->predecessors_count; i++) { + if (!ctx->blocks[ctx->block->predecessors[i]->index].visited) { + all_preds_visited = false; break; - ra_assert(ctx, arr2->start_ip <= arr2->end_ip); - - unsigned base2 = arr2->reg; - unsigned len2 = arr2->length; - unsigned len = arr->length; - - if (mergedregs) { - /* convert into half-reg space: */ - if (!arr2->half) { - base2 *= 2; - len2 *= 2; - } - if (!arr->half) { - len *= 2; - } - } else if (arr2->half != arr->half) { - /* for split-register-file mode, we only conflict with - * other arrays of same precision: - */ - continue; - } - - /* if it intersects with liverange AND register range.. 
*/ - if (intersects(arr->start_ip, arr->end_ip, - arr2->start_ip, arr2->end_ip) && - intersects(base, base + len, - base2, base2 + len2)) { - base = MAX2(base, base2 + len2); - goto retry; } } - /* also need to not conflict with any pre-assigned inputs: */ - for (unsigned i = 0; i < nprecolor; i++) { - struct ir3_instruction *instr = precolor[i]; + if (!all_preds_visited) { + struct ra_block_state *state = &ctx->blocks[ctx->block->index]; + state->entry_regs = _mesa_pointer_hash_table_create(ctx); + + insert_entry_regs(state, &ctx->full); + insert_entry_regs(state, &ctx->half); + insert_entry_regs(state, &ctx->shared); + } +} - if (!instr || (instr->flags & IR3_INSTR_UNUSED)) +static void +insert_live_out_move(struct ra_ctx *ctx, struct ra_interval *interval) +{ + for (unsigned i = 0; i < 2; i++) { + if (!ctx->block->successors[i]) continue; - struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip]; + struct ir3_block *succ = ctx->block->successors[i]; + struct ra_block_state *succ_state = &ctx->blocks[succ->index]; - /* only consider the first component: */ - if (id->off > 0) + if (!succ_state->visited) continue; - unsigned name = ra_name(ctx, id); - unsigned regid = instr->regs[0]->num; - unsigned reglen = class_sizes[id->cls]; - unsigned len = arr->length; - - if (mergedregs) { - /* convert into half-reg space: */ - if (!is_half(instr)) { - regid *= 2; - reglen *= 2; - } - if (!arr->half) { - len *= 2; - } - } else if (is_half(instr) != arr->half) { - /* for split-register-file mode, we only conflict with - * other arrays of same precision: - */ + struct hash_entry *entry = + _mesa_hash_table_search(succ_state->entry_regs, interval->interval.reg); + if (!entry) continue; - } - /* Check if array intersects with liverange AND register - * range of the input: - */ - if (intersects(arr->start_ip, arr->end_ip, - ctx->def[name], ctx->use[name]) && - intersects(base, base + len, - regid, regid + reglen)) { - base = MAX2(base, regid + reglen); - goto retry; + physreg_t new_reg = (physreg_t)(uintptr_t)entry->data; + if (new_reg != interval->physreg_start) { + insert_liveout_copy(ctx->block, new_reg, interval->physreg_start, + interval->interval.reg); } } +} - /* convert back from half-reg space to fullreg space: */ - if (mergedregs && !arr->half) { - base = DIV_ROUND_UP(base, 2); +static void +insert_file_live_out_moves(struct ra_ctx *ctx, struct ra_file *file) +{ + rb_tree_foreach(struct ra_interval, interval, &file->physreg_intervals, physreg_node) { + insert_live_out_move(ctx, interval); } +} - arr->reg = base; +static void +insert_live_out_moves(struct ra_ctx *ctx) +{ + insert_file_live_out_moves(ctx, &ctx->full); + insert_file_live_out_moves(ctx, &ctx->half); + insert_file_live_out_moves(ctx, &ctx->shared); } -/* handle pre-colored registers. This includes "arrays" (which could be of - * length 1, used for phi webs lowered to registers in nir), as well as - * special shader input values that need to be pinned to certain registers. - */ static void -ra_precolor(struct ir3_ra_ctx *ctx, struct ir3_instruction **precolor, unsigned nprecolor) +handle_block(struct ra_ctx *ctx, struct ir3_block *block) { - for (unsigned i = 0; i < nprecolor; i++) { - if (precolor[i] && !(precolor[i]->flags & IR3_INSTR_UNUSED)) { - struct ir3_instruction *instr = precolor[i]; + ctx->block = block; + + /* Reset the register files from the last block */ + ra_file_init(&ctx->full); + ra_file_init(&ctx->half); + ra_file_init(&ctx->shared); + + /* Handle live-ins, phis, and input meta-instructions. 
These all appear + * live at the beginning of the block, and interfere with each other + * therefore need to be allocated "in parallel". This means that we + * have to allocate all of them, inserting them into the file, and then + * delay updating the IR until all of them are allocated. + * + * Handle precolored inputs first, because we need to make sure that other + * inputs don't overwrite them. We shouldn't have both live-ins/phi nodes + * and inputs at the same time, because the first block doesn't have + * predecessors. Therefore handle_live_in doesn't have to worry about + * them. + */ - if (instr->regs[0]->num == INVALID_REG) - continue; + foreach_instr (instr, &block->instr_list) { + if (instr->opc == OPC_META_INPUT) + handle_precolored_input(ctx, instr); + else + break; + } - struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip]; - - ra_assert(ctx, !(instr->regs[0]->flags & (IR3_REG_HALF | IR3_REG_SHARED))); - - /* 'base' is in scalar (class 0) but we need to map that - * the conflicting register of the appropriate class (ie. - * input could be vec2/vec3/etc) - * - * Note that the higher class (larger than scalar) regs - * are setup to conflict with others in the same class, - * so for example, R1 (scalar) is also the first component - * of D1 (vec2/double): - * - * Single (base) | Double - * --------------+--------------- - * R0 | D0 - * R1 | D0 D1 - * R2 | D1 D2 - * R3 | D2 - * .. and so on.. - */ - unsigned regid = instr->regs[0]->num; - ra_assert(ctx, regid >= id->off); - regid -= id->off; + unsigned name; + BITSET_FOREACH_SET(name, ctx->live->live_in[block->index], + ctx->live->definitions_count) { + struct ir3_register *reg = ctx->live->definitions[name]; + handle_live_in(ctx, reg); + } - unsigned reg = ctx->set->gpr_to_ra_reg[id->cls][regid]; - unsigned name = ra_name(ctx, id); - ra_set_node_reg(ctx->g, name, reg); - } + foreach_instr (instr, &block->instr_list) { + if (instr->opc == OPC_META_PHI) + handle_phi(ctx, instr->regs[0]); + else if (instr->opc == OPC_META_INPUT || instr->opc == OPC_META_TEX_PREFETCH) + handle_input(ctx, instr); + else + break; } - /* - * Pre-assign array elements: + /* After this point, every live-in/phi/input has an interval assigned to + * it. We delay actually assigning values until everything has been + * allocated, so we can simply ignore any parallel copy entries created + * when shuffling them around. */ - foreach_array (arr, &ctx->ir->array_list) { - - if (arr->end_ip == 0) - continue; - - if (!ctx->scalar_pass) - assign_arr_base(ctx, arr, precolor, nprecolor); + ctx->parallel_copies_count = 0; - for (unsigned i = 0; i < arr->length; i++) { - unsigned cls = arr->half ? HALF_OFFSET : 0; + insert_live_in_moves(ctx); - ra_set_node_reg(ctx->g, - arr->base + i, /* vreg name */ - ctx->set->gpr_to_ra_reg[cls][arr->reg + i]); - } + if (RA_DEBUG) { + printf("after live-in block %u:\n", block->index); + ra_ctx_dump(ctx); } - if (ir3_shader_debug & IR3_DBG_OPTMSGS) { - foreach_array (arr, &ctx->ir->array_list) { - unsigned first = arr->reg; - unsigned last = arr->reg + arr->length - 1; - debug_printf("arr[%d] at r%d.%c->r%d.%c\n", arr->id, - (first >> 2), "xyzw"[first & 0x3], - (last >> 2), "xyzw"[last & 0x3]); + /* Now we're done with processing live-ins, and can handle the body of the + * block. 
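+ *
+ * Each instruction in the body is dispatched on its opcode below: phis
+ * and inputs already have intervals from the code above and only get
+ * their registers assigned here, split/collect/parallel-copy get
+ * coalescing-aware handling, chmask handles its precolored sources,
+ * and everything else goes through handle_normal_instr().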
+ */ + foreach_instr (instr, &block->instr_list) { + if (RA_DEBUG) { + printf("processing: "); + ir3_print_instr(instr); } + + if (instr->opc == OPC_META_PHI) + assign_phi(ctx, instr); + else if (instr->opc == OPC_META_INPUT || instr->opc == OPC_META_TEX_PREFETCH) + assign_input(ctx, instr); + else if (instr->opc == OPC_META_SPLIT) + handle_split(ctx, instr); + else if (instr->opc == OPC_META_COLLECT) + handle_collect(ctx, instr); + else if (instr->opc == OPC_META_PARALLEL_COPY) + handle_pcopy(ctx, instr); + else if (instr->opc == OPC_CHMASK) + handle_chmask(ctx, instr); + else + handle_normal_instr(ctx, instr); + + if (RA_DEBUG) + ra_ctx_dump(ctx); } -} -static void -precolor(struct ir3_ra_ctx *ctx, struct ir3_instruction *instr) -{ - struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip]; - unsigned n = dest_regs(instr); - for (unsigned i = 0; i < n; i++) { - /* tex instructions actually have a wrmask, and - * don't touch masked out components. So we - * shouldn't precolor them:: - */ - if (is_tex_or_prefetch(instr) && - !(instr->regs[0]->wrmask & (1 << i))) - continue; + insert_live_out_moves(ctx); - unsigned name = scalar_name(ctx, instr, i); - unsigned regid = instr->regs[0]->num + i; + BITSET_FOREACH_SET(name, ctx->live->live_out[block->index], + ctx->live->definitions_count) { + struct ir3_register *reg = ctx->live->definitions[name]; + handle_live_out(ctx, reg); + } - if (instr->regs[0]->flags & IR3_REG_SHARED) - regid -= FIRST_SHARED_REG; + ctx->blocks[block->index].visited = true; - unsigned vreg = ctx->set->gpr_to_ra_reg[id->cls][regid]; - ra_set_node_reg(ctx->g, name, vreg); + for (unsigned i = 0; i < block->dom_children_count; i++) { + handle_block(ctx, block->dom_children[i]); } } -/* pre-color non-scalar registers based on the registers assigned in previous - * pass. Do this by looking actually at the fanout instructions. - */ -static void -ra_precolor_assigned(struct ir3_ra_ctx *ctx) +static unsigned +calc_target_full_pressure(struct ir3_shader_variant *v, unsigned pressure) { - ra_assert(ctx, ctx->scalar_pass); + /* Registers are allocated in units of vec4, so switch from units of + * half-regs to vec4. 
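+ *
+ * For example (illustrative numbers): a max pressure of 25 half-regs
+ * becomes DIV_ROUND_UP(25, 2 * 4) = 4 vec4 registers. We then raise
+ * the target as long as the threadsize decision is unchanged and the
+ * register-dependent wave count stays at or above the original target,
+ * and convert back to units of half-regs at the end.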
+ */ + unsigned reg_count = DIV_ROUND_UP(pressure, 2 * 4); + + bool double_threadsize = ir3_should_double_threadsize(v, reg_count); + + unsigned target = reg_count; + unsigned reg_independent_max_waves = + ir3_get_reg_independent_max_waves(v, double_threadsize); + unsigned reg_dependent_max_waves = + ir3_get_reg_dependent_max_waves(v->shader->compiler, reg_count, + double_threadsize); + unsigned target_waves = + MIN2(reg_independent_max_waves, reg_dependent_max_waves); + + while (target <= RA_FULL_SIZE / (2 * 4) && + ir3_should_double_threadsize(v, target) == double_threadsize && + ir3_get_reg_dependent_max_waves(v->shader->compiler, target, + double_threadsize) >= target_waves) + target++; + + return (target - 1) * 2 * 4; +} - foreach_block (block, &ctx->ir->block_list) { - foreach_instr (instr, &block->instr_list) { +int +ir3_ra(struct ir3_shader_variant *v) +{ + ir3_calc_dominance(v->ir); - if (!writes_gpr(instr)) - continue; + ir3_create_parallel_copies(v->ir); - if (should_assign(ctx, instr)) - continue; + struct ir3_liveness *live = ir3_calc_liveness(v); - precolor(ctx, instr); + ir3_debug_print(v->ir, "AFTER: create_parallel_copies"); - foreach_src (src, instr) { - if (!src->def) - continue; - precolor(ctx, src->def->instr); - } - } - } -} + ir3_merge_regs(live, v->ir); -static int -ra_alloc(struct ir3_ra_ctx *ctx) -{ - if (!ra_allocate(ctx->g)) - return -1; + struct ir3_pressure max_pressure; + ir3_calc_pressure(v, live, &max_pressure); + d("max pressure:"); + d("\tfull: %u", max_pressure.full); + d("\thalf: %u", max_pressure.half); + d("\tshared: %u", max_pressure.shared); - foreach_block (block, &ctx->ir->block_list) { - ra_block_alloc(ctx, block); + if (v->mergedregs) { + max_pressure.full += max_pressure.half; + max_pressure.half = 0; } - return 0; -} - -/* if we end up with split/collect instructions with non-matching src - * and dest regs, that means something has gone wrong. Which makes it - * a pretty good sanity check. - */ -static void -ra_sanity_check(struct ir3 *ir) -{ - foreach_block (block, &ir->block_list) { - foreach_instr (instr, &block->instr_list) { - if (instr->opc == OPC_META_SPLIT) { - struct ir3_register *dst = instr->regs[0]; - struct ir3_register *src = instr->regs[1]; - debug_assert(dst->num == (src->num + instr->split.off)); - } else if (instr->opc == OPC_META_COLLECT) { - struct ir3_register *dst = instr->regs[0]; - - foreach_src_n (src, n, instr) { - debug_assert(dst->num == (src->num - n)); - } - } - } + if (max_pressure.full > RA_FULL_SIZE || + max_pressure.half > RA_HALF_SIZE || + max_pressure.shared > RA_SHARED_SIZE) { + d("max pressure exceeded!"); + return 1; } -} - -static int -ir3_ra_pass(struct ir3_shader_variant *v, struct ir3_instruction **precolor, - unsigned nprecolor, bool scalar_pass) -{ - struct ir3_ra_ctx ctx = { - .v = v, - .ir = v->ir, - .set = v->mergedregs ? 
- v->ir->compiler->mergedregs_set : v->ir->compiler->set, - .scalar_pass = scalar_pass, - }; - int ret; - ret = setjmp(ctx.jmp_env); - if (ret) - goto fail; + struct ra_ctx *ctx = rzalloc(NULL, struct ra_ctx); - ra_init(&ctx); - ra_add_interference(&ctx); - ra_precolor(&ctx, precolor, nprecolor); - if (scalar_pass) - ra_precolor_assigned(&ctx); - ret = ra_alloc(&ctx); + ctx->merged_regs = v->mergedregs; + ctx->compiler = v->shader->compiler; + ctx->stage = v->type; + ctx->live = live; + ctx->intervals = rzalloc_array(ctx, struct ra_interval, live->definitions_count); + ctx->blocks = rzalloc_array(ctx, struct ra_block_state, live->block_count); -fail: - ra_destroy(&ctx); + ctx->full.size = calc_target_full_pressure(v, max_pressure.full); + d("full size: %u", ctx->full.size); + + if (!v->mergedregs) + ctx->half.size = RA_HALF_SIZE; - return ret; -} + ctx->shared.size = RA_SHARED_SIZE; -int -ir3_ra(struct ir3_shader_variant *v, struct ir3_instruction **precolor, - unsigned nprecolor) -{ - int ret; + handle_block(ctx, ir3_start_block(v->ir)); - /* First pass, assign the vecN (non-scalar) registers: */ - ret = ir3_ra_pass(v, precolor, nprecolor, false); - if (ret) - return ret; + /* Strip array-ness and SSA-ness at the end, because various helpers still + * need to work even on definitions that have already been assigned. For + * example, we need to preserve array-ness so that array live-ins have the + * right size. + */ + foreach_block (block, &v->ir->block_list) { + foreach_instr (instr, &block->instr_list) { + for (unsigned i = 0; i < instr->regs_count; i++) { + instr->regs[i]->flags &= ~IR3_REG_SSA; - ir3_debug_print(v->ir, "AFTER: ir3_ra (1st pass)"); + /* Parallel copies of array registers copy the whole register, + * and we need some way to let the parallel copy code know + * that this was an array whose size is determined by + * reg->size. So keep the array flag on those. + */ + if (!is_meta(instr)) + instr->regs[i]->flags &= ~IR3_REG_ARRAY; + } + } + } - /* Second pass, assign the scalar registers: */ - ret = ir3_ra_pass(v, precolor, nprecolor, true); - if (ret) - return ret; + ir3_debug_print(v->ir, "AFTER: register allocation"); - ir3_debug_print(v->ir, "AFTER: ir3_ra (2st pass)"); + ir3_lower_copies(v); -#ifdef DEBUG -# define SANITY_CHECK DEBUG -#else -# define SANITY_CHECK 0 -#endif - if (SANITY_CHECK) - ra_sanity_check(v->ir); + ir3_debug_print(v->ir, "AFTER: ir3_lower_copies"); - return ret; + ralloc_free(ctx); + ralloc_free(live); + return 0; } + diff --git a/src/freedreno/ir3/ir3_ra.h b/src/freedreno/ir3/ir3_ra.h index 34f2549..672b536 100644 --- a/src/freedreno/ir3/ir3_ra.h +++ b/src/freedreno/ir3/ir3_ra.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2014 Rob Clark + * Copyright (C) 2021 Valve Corporation * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -19,361 +19,282 @@ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. - * - * Authors: - * Rob Clark */ -#ifndef IR3_RA_H_ -#define IR3_RA_H_ +#ifndef _IR3_RA_H +#define _IR3_RA_H -#include +#include "ir3.h" +#include "ir3_compiler.h" +#include "util/rb_tree.h" -#include "util/bitset.h" +#ifdef DEBUG +#define RA_DEBUG (ir3_shader_debug & IR3_DBG_RAMSGS) +#else +#define RA_DEBUG 0 +#endif +#define d(fmt, ...) 
do { if (RA_DEBUG) { \ + printf("RA: "fmt"\n", ##__VA_ARGS__); \ +} } while (0) +#define di(instr, fmt, ...) do { if (RA_DEBUG) { \ + printf("RA: "fmt": ", ##__VA_ARGS__); \ + ir3_print_instr(instr); \ +} } while (0) -static const unsigned class_sizes[] = { - 1, 2, 3, 4, - 4 + 4, /* txd + 1d/2d */ - 4 + 6, /* txd + 3d */ -}; -#define class_count ARRAY_SIZE(class_sizes) +typedef uint16_t physreg_t; -static const unsigned half_class_sizes[] = { - 1, 2, 3, 4, -}; -#define half_class_count ARRAY_SIZE(half_class_sizes) +static inline unsigned +ra_physreg_to_num(physreg_t physreg, unsigned flags) +{ + if (!(flags & IR3_REG_HALF)) + physreg /= 2; + if (flags & IR3_REG_SHARED) + physreg += 48 * 4; + return physreg; +} -/* seems to just be used for compute shaders? Seems like vec1 and vec3 - * are sufficient (for now?) - */ -static const unsigned shared_class_sizes[] = { - 1, 3, -}; -#define shared_class_count ARRAY_SIZE(shared_class_sizes) +static inline physreg_t +ra_num_to_physreg(unsigned num, unsigned flags) +{ + if (flags & IR3_REG_SHARED) + num -= 48 * 4; + if (!(flags & IR3_REG_HALF)) + num *= 2; + return num; +} -#define total_class_count (class_count + half_class_count + shared_class_count) +static inline unsigned +ra_reg_get_num(const struct ir3_register *reg) +{ + return (reg->flags & IR3_REG_ARRAY) ? reg->array.base : reg->num; +} -/* Below a0.x are normal regs. RA doesn't need to assign a0.x/p0.x. */ -#define NUM_REGS (4 * 48) /* r0 to r47 */ -#define NUM_SHARED_REGS (4 * 8) /* r48 to r55 */ -#define FIRST_SHARED_REG (4 * 48) -/* Number of virtual regs in a given class: */ +static inline physreg_t +ra_reg_get_physreg(const struct ir3_register *reg) +{ + return ra_num_to_physreg(ra_reg_get_num(reg), reg->flags); +} -static inline unsigned CLASS_REGS(unsigned i) +static inline bool +def_is_gpr(const struct ir3_register *reg) { - assert(i < class_count); + return reg_num(reg) != REG_A0 && reg_num(reg) != REG_P0; +} - return (NUM_REGS - (class_sizes[i] - 1)); +/* Note: don't count undef as a source. + */ +static inline bool +ra_reg_is_src(const struct ir3_register *reg) +{ + return (reg->flags & IR3_REG_SSA) && reg->def && + def_is_gpr(reg->def); } -static inline unsigned HALF_CLASS_REGS(unsigned i) +/* Array destinations can act as a source, reading the previous array and then + * modifying it. Return true when the register is an array destination that + * acts like a source. + */ +static inline bool +ra_reg_is_array_rmw(const struct ir3_register *reg) { - assert(i < half_class_count); + return ((reg->flags & IR3_REG_ARRAY) && (reg->flags & IR3_REG_DEST) && reg->def); +} - return (NUM_REGS - (half_class_sizes[i] - 1)); +static inline bool +ra_reg_is_dst(const struct ir3_register *reg) +{ + return (reg->flags & IR3_REG_SSA) && (reg->flags & IR3_REG_DEST) && + def_is_gpr(reg) && + ((reg->flags & IR3_REG_ARRAY) || reg->wrmask); } -static inline unsigned SHARED_CLASS_REGS(unsigned i) +static inline struct ir3_register * +ra_dst_get_tied_src(const struct ir3_compiler *compiler, struct ir3_register *dst) { - assert(i < shared_class_count); + /* With the a6xx new cat6 encoding, the same register is used for the + * value and destination of atomic operations. 
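+ *
+ * In that case the tied source is regs[3] and the destination is
+ * regs[0]; ra_src_get_tied_dst() below is the inverse mapping.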
+ */ + if (compiler->gpu_id >= 600 && is_atomic(dst->instr->opc) && + (dst->instr->flags & IR3_INSTR_G)) { + return dst->instr->regs[3]; + } - return (NUM_SHARED_REGS - (shared_class_sizes[i] - 1)); + return NULL; } -#define HALF_OFFSET (class_count) -#define SHARED_OFFSET (class_count + half_class_count) +/* Iterators for sources and destinations which: + * - Don't include fake sources (irrelevant for RA) + * - Don't include non-SSA sources (immediates and constants, also irrelevant) + * - Consider array destinations as both a source and a destination + */ -/* register-set, created one time, used for all shaders: */ -struct ir3_ra_reg_set { - struct ra_regs *regs; - struct ra_class *classes[class_count]; - struct ra_class *half_classes[half_class_count]; - struct ra_class *shared_classes[shared_class_count]; +#define ra_foreach_src(__srcreg, __instr) \ + for (struct ir3_register *__srcreg = (void *)~0; __srcreg; __srcreg = NULL) \ + for (unsigned __cnt = (__instr)->regs_count, __i = 0; __i < __cnt; __i++) \ + if (ra_reg_is_src((__srcreg = (__instr)->regs[__i]))) + +#define ra_foreach_src_rev(__srcreg, __instr) \ + for (struct ir3_register *__srcreg = (void *)~0; __srcreg; __srcreg = NULL) \ + for (int __cnt = (__instr)->regs_count, __i = __cnt - 1; __i >= 0; __i--) \ + if (ra_reg_is_src((__srcreg = (__instr)->regs[__i]))) + +#define ra_foreach_dst(__srcreg, __instr) \ + for (struct ir3_register *__srcreg = (void *)~0; __srcreg; __srcreg = NULL) \ + for (unsigned __cnt = (__instr)->regs_count, __i = 0; __i < __cnt; __i++) \ + if (ra_reg_is_dst((__srcreg = (__instr)->regs[__i]))) + +static inline struct ir3_register * +ra_src_get_tied_dst(const struct ir3_compiler *compiler, + struct ir3_instruction *instr, + struct ir3_register *src) +{ + if (compiler->gpu_id >= 600 && is_atomic(instr->opc) && + (instr->flags & IR3_INSTR_G) && src == instr->regs[3]) { + return instr->regs[0]; + } - /* pre-fetched tex dst is limited, on current gens to regs - * 0x3f and below. An additional register class, with one - * vreg, that is setup to conflict with any regs above that - * limit. - */ - struct ra_class *prefetch_exclude_class; - unsigned prefetch_exclude_reg; - - /* The virtual register space flattens out all the classes, - * starting with full, followed by half and then shared, ie: - * - * scalar full (starting at zero) - * vec2 full - * vec3 full - * ... - * vecN full - * scalar half (starting at first_half_reg) - * vec2 half - * ... - * vecN half - * scalar shared (starting at first_shared_reg) - * ... 
- * vecN shared - * - */ - unsigned first_half_reg, first_shared_reg; + return NULL; +} - /* maps flat virtual register space to base gpr: */ - uint16_t *ra_reg_to_gpr; - /* maps cls,gpr to flat virtual register space: */ - uint16_t **gpr_to_ra_reg; -}; -/* additional block-data (per-block) */ -struct ir3_ra_block_data { - BITSET_WORD *def; /* variables defined before used in block */ - BITSET_WORD *use; /* variables used before defined in block */ - BITSET_WORD *livein; /* which defs reach entry point of block */ - BITSET_WORD *liveout; /* which defs reach exit point of block */ -}; +#define RA_HALF_SIZE (4 * 48) +#define RA_FULL_SIZE (4 * 48 * 2) +#define RA_SHARED_SIZE (2 * 4 * 8) +#define RA_MAX_FILE_SIZE RA_FULL_SIZE -/* additional instruction-data (per-instruction) */ -struct ir3_ra_instr_data { - /* cached instruction 'definer' info: */ - struct ir3_instruction *defn; - int off, sz, cls; +struct ir3_liveness { + unsigned block_count; + DECLARE_ARRAY(struct ir3_register *, definitions); + DECLARE_ARRAY(BITSET_WORD *, live_out); + DECLARE_ARRAY(BITSET_WORD *, live_in); }; -/* register-assign context, per-shader */ -struct ir3_ra_ctx { - struct ir3_shader_variant *v; - struct ir3 *ir; +struct ir3_liveness *ir3_calc_liveness(struct ir3_shader_variant *v); - struct ir3_ra_reg_set *set; - struct ra_graph *g; +bool ir3_def_live_after(struct ir3_liveness *live, struct ir3_register *def, + struct ir3_instruction *instr); - /* Are we in the scalar assignment pass? In this pass, all larger- - * than-vec1 vales have already been assigned and pre-colored, so - * we only consider scalar values. - */ - bool scalar_pass; - - unsigned alloc_count; - unsigned r0_xyz_nodes; /* ra node numbers for r0.[xyz] precolors */ - unsigned hr0_xyz_nodes; /* ra node numbers for hr0.[xyz] precolors */ - unsigned prefetch_exclude_node; - /* one per class, plus one slot for arrays: */ - unsigned class_alloc_count[total_class_count + 1]; - unsigned class_base[total_class_count + 1]; - unsigned instr_cnt; - unsigned *def, *use; /* def/use table */ - struct ir3_ra_instr_data *instrd; - - /* Mapping vreg name back to instruction, used select reg callback: */ - struct hash_table *name_to_instr; - - /* Tracking for select_reg callback */ - unsigned start_search_reg; - unsigned max_target; - - /* Temporary buffer for def/use iterators - * - * The worst case should probably be an array w/ relative access (ie. - * all elements are def'd or use'd), and that can't be larger than - * the number of registers. - * - * NOTE we could declare this on the stack if needed, but I don't - * think there is a need for nested iterators. - */ - unsigned namebuf[NUM_REGS]; - unsigned namecnt, nameidx; +void ir3_create_parallel_copies(struct ir3 *ir); + +void ir3_merge_regs(struct ir3_liveness *live, struct ir3 *ir); - /* Error handling: */ - jmp_buf jmp_env; +struct ir3_pressure { + unsigned full, half, shared; }; -#define ra_assert(ctx, expr) do { \ - if (!(expr)) { \ - _debug_printf("RA: %s:%u: %s: Assertion `%s' failed.\n", __FILE__, __LINE__, __func__, #expr); \ - longjmp((ctx)->jmp_env, -1); \ - } \ - } while (0) -#define ra_unreachable(ctx, str) ra_assert(ctx, !str) +void ir3_calc_pressure(struct ir3_shader_variant *v, + struct ir3_liveness *live, + struct ir3_pressure *max_pressure); -static inline int -ra_name(struct ir3_ra_ctx *ctx, struct ir3_ra_instr_data *id) -{ - unsigned name; - debug_assert(id->cls >= 0); - debug_assert(id->cls < total_class_count); /* we shouldn't get arrays here.. 
*/ - name = ctx->class_base[id->cls] + id->defn->name; - debug_assert(name < ctx->alloc_count); - return name; -} +void ir3_lower_copies(struct ir3_shader_variant *v); -/* Get the scalar name of the n'th component of an instruction dst: */ -static inline int -scalar_name(struct ir3_ra_ctx *ctx, struct ir3_instruction *instr, unsigned n) -{ - if (ctx->scalar_pass) { - if (instr->opc == OPC_META_SPLIT) { - debug_assert(n == 0); /* split results in a scalar */ - struct ir3_instruction *src = instr->regs[1]->def->instr; - return scalar_name(ctx, src, instr->split.off); - } else if (instr->opc == OPC_META_COLLECT) { - debug_assert(n < (instr->regs_count + 1)); - struct ir3_instruction *src = instr->regs[n + 1]->def->instr; - return scalar_name(ctx, src, 0); - } - } else { - debug_assert(n == 0); - } +/* Register interval datastructure + * + * ir3_reg_ctx is used to track which registers are live. The tricky part is + * that some registers may overlap each other, when registers with overlapping + * live ranges get coalesced. For example, splits will overlap with their + * parent vector and sometimes collect sources will also overlap with the + * collect'ed vector. ir3_merge_regs guarantees for us that none of the + * registers in a merge set that are live at any given point partially + * overlap, which means that we can organize them into a forest. While each + * register has a per-merge-set offset, ir3_merge_regs also computes a + * "global" offset which allows us to throw away the original merge sets and + * think of registers as just intervals in a forest of live intervals. When a + * register becomes live, we insert it into the forest, and when it dies we + * remove it from the forest (and then its children get moved up a level). We + * use red-black trees to keep track of each level of the forest, so insertion + * and deletion should be fast operations. ir3_reg_ctx handles all the + * internal bookkeeping for this, so that it can be shared between RA, + * spilling, and register pressure tracking. + */ - return ra_name(ctx, &ctx->instrd[instr->ip]) + n; -} +struct ir3_reg_interval { + struct rb_node node; -#define NO_NAME ~0 + struct rb_tree children; -/* - * Iterators to iterate the vreg names of an instructions def's and use's - */ + struct ir3_reg_interval *parent; -static inline unsigned -__ra_name_cnt(struct ir3_ra_ctx *ctx, struct ir3_instruction *instr) -{ - if (!instr) - return 0; + struct ir3_register *reg; + + bool inserted; +}; - /* Filter special cases, ie. writes to a0.x or p0.x, or non-ssa: */ - if (!writes_gpr(instr) || (instr->regs[0]->flags & IR3_REG_ARRAY)) - return 0; +struct ir3_reg_ctx { + /* The tree of top-level intervals in the forest. */ + struct rb_tree intervals; - /* in scalar pass, we aren't considering virtual register classes, ie. - * if an instruction writes a vec2, then it defines two different scalar - * register names. + /* Users of ir3_reg_ctx need to keep around additional state that is + * modified when top-level intervals are added or removed. For register + * pressure tracking, this is just the register pressure, but for RA we + * need to keep track of the physreg of each top-level interval. These + * callbacks provide a place to let users deriving from ir3_reg_ctx update + * their state when top-level intervals are inserted/removed. */ - if (ctx->scalar_pass) - return dest_regs(instr); - return 1; -} + /* Called when an interval is added and it turns out to be at the top + * level. 
+ */ + void (*interval_add)(struct ir3_reg_ctx *ctx, + struct ir3_reg_interval *interval); -#define foreach_name_n(__name, __n, __ctx, __instr) \ - for (unsigned __cnt = __ra_name_cnt(__ctx, __instr), __n = 0, __name; \ - (__n < __cnt) && ({__name = scalar_name(__ctx, __instr, __n); 1;}); __n++) + /* Called when an interval is deleted from the top level. */ + void (*interval_delete)(struct ir3_reg_ctx *ctx, + struct ir3_reg_interval *interval); -#define foreach_name(__name, __ctx, __instr) \ - foreach_name_n(__name, __n, __ctx, __instr) + /* Called when an interval is deleted and its child becomes top-level. + */ + void (*interval_readd)(struct ir3_reg_ctx *ctx, + struct ir3_reg_interval *parent, + struct ir3_reg_interval *child); +}; -static inline unsigned -__ra_itr_pop(struct ir3_ra_ctx *ctx) +static inline struct ir3_reg_interval * +ir3_rb_node_to_interval(struct rb_node *node) { - if (ctx->nameidx < ctx->namecnt) - return ctx->namebuf[ctx->nameidx++]; - return NO_NAME; + return rb_node_data(struct ir3_reg_interval, node, node); } -static inline void -__ra_itr_push(struct ir3_ra_ctx *ctx, unsigned name) +static inline const struct ir3_reg_interval * +ir3_rb_node_to_interval_const(const struct rb_node *node) { - assert(ctx->namecnt < ARRAY_SIZE(ctx->namebuf)); - ctx->namebuf[ctx->namecnt++] = name; + return rb_node_data(struct ir3_reg_interval, node, node); } -static inline unsigned -__ra_init_def_itr(struct ir3_ra_ctx *ctx, struct ir3_instruction *instr) +static inline struct ir3_reg_interval * +ir3_reg_interval_next(struct ir3_reg_interval *interval) { - /* nested use is not supported: */ - assert(ctx->namecnt == ctx->nameidx); - - ctx->namecnt = ctx->nameidx = 0; - - if (!writes_gpr(instr)) - return NO_NAME; - - struct ir3_ra_instr_data *id = &ctx->instrd[instr->ip]; - struct ir3_register *dst = instr->regs[0]; - - if (dst->flags & IR3_REG_ARRAY) { - struct ir3_array *arr = ir3_lookup_array(ctx->ir, dst->array.id); - - /* indirect write is treated like a write to all array - * elements, since we don't know which one is actually - * written: - */ - if (dst->flags & IR3_REG_RELATIV) { - for (unsigned i = 0; i < arr->length; i++) { - __ra_itr_push(ctx, arr->base + i); - } - } else { - __ra_itr_push(ctx, arr->base + dst->array.offset); - debug_assert(dst->array.offset < arr->length); - } - } else if (id->defn == instr) { - foreach_name_n (name, i, ctx, instr) { - /* tex instructions actually have a wrmask, and - * don't touch masked out components. We can't do - * anything useful about that in the first pass, - * but in the scalar pass we can realize these - * registers are available: - */ - if (ctx->scalar_pass && is_tex_or_prefetch(instr) && - !(instr->regs[0]->wrmask & (1 << i))) - continue; - __ra_itr_push(ctx, name); - } - } - - return __ra_itr_pop(ctx); + struct rb_node *next = rb_node_next(&interval->node); + return next ? 
ir3_rb_node_to_interval(next) : NULL; } -static inline unsigned -__ra_init_use_itr(struct ir3_ra_ctx *ctx, struct ir3_instruction *instr) +static inline struct ir3_reg_interval * +ir3_reg_interval_next_or_null(struct ir3_reg_interval *interval) { - /* nested use is not supported: */ - assert(ctx->namecnt == ctx->nameidx); - - ctx->namecnt = ctx->nameidx = 0; - - foreach_src (reg, instr) { - if (reg->flags & IR3_REG_ARRAY) { - struct ir3_array *arr = - ir3_lookup_array(ctx->ir, reg->array.id); - - /* indirect read is treated like a read from all array - * elements, since we don't know which one is actually - * read: - */ - if (reg->flags & IR3_REG_RELATIV) { - for (unsigned i = 0; i < arr->length; i++) { - __ra_itr_push(ctx, arr->base + i); - } - } else { - __ra_itr_push(ctx, arr->base + reg->array.offset); - debug_assert(reg->array.offset < arr->length); - } - } else if (reg->def) { - foreach_name_n (name, i, ctx, reg->def->instr) { - /* split takes a src w/ wrmask potentially greater - * than 0x1, but it really only cares about a single - * component. This shows up in splits coming out of - * a tex instruction w/ wrmask=.z, for example. - */ - if (ctx->scalar_pass && (instr->opc == OPC_META_SPLIT) && - !(i == instr->split.off)) - continue; - __ra_itr_push(ctx, name); - } - } - } + return interval ? ir3_reg_interval_next(interval) : NULL; +} - return __ra_itr_pop(ctx); +static inline void +ir3_reg_interval_init(struct ir3_reg_interval *interval, struct ir3_register *reg) +{ + rb_tree_init(&interval->children); + interval->reg = reg; + interval->parent = NULL; + interval->inserted = false; } -#define foreach_def(__name, __ctx, __instr) \ - for (unsigned __name = __ra_init_def_itr(__ctx, __instr); \ - __name != NO_NAME; __name = __ra_itr_pop(__ctx)) +void +ir3_reg_interval_dump(struct ir3_reg_interval *interval); + +void ir3_reg_interval_insert(struct ir3_reg_ctx *ctx, + struct ir3_reg_interval *interval); + +void ir3_reg_interval_remove(struct ir3_reg_ctx *ctx, + struct ir3_reg_interval *interval); -#define foreach_use(__name, __ctx, __instr) \ - for (unsigned __name = __ra_init_use_itr(__ctx, __instr); \ - __name != NO_NAME; __name = __ra_itr_pop(__ctx)) +void ir3_reg_interval_remove_all(struct ir3_reg_ctx *ctx, + struct ir3_reg_interval *interval); -int ra_size_to_class(unsigned sz, bool half, bool shared); -int ra_class_to_size(unsigned class, bool *half, bool *shared); +#endif -#endif /* IR3_RA_H_ */ diff --git a/src/freedreno/ir3/ir3_ra_regset.c b/src/freedreno/ir3/ir3_ra_regset.c deleted file mode 100644 index d7290bf..0000000 --- a/src/freedreno/ir3/ir3_ra_regset.c +++ /dev/null @@ -1,249 +0,0 @@ -/* - * Copyright (C) 2014 Rob Clark - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - * Authors: - * Rob Clark - */ - -#include "util/u_math.h" -#include "util/register_allocate.h" -#include "util/ralloc.h" -#include "util/bitset.h" - -#include "ir3.h" -#include "ir3_compiler.h" -#include "ir3_ra.h" - -static void -setup_conflicts(struct ir3_ra_reg_set *set) -{ - unsigned reg; - - reg = 0; - for (unsigned i = 0; i < class_count; i++) { - for (unsigned j = 0; j < CLASS_REGS(i); j++) { - for (unsigned br = j; br < j + class_sizes[i]; br++) { - ra_add_transitive_reg_conflict(set->regs, br, reg); - } - - reg++; - } - } - - for (unsigned i = 0; i < half_class_count; i++) { - for (unsigned j = 0; j < HALF_CLASS_REGS(i); j++) { - for (unsigned br = j; br < j + half_class_sizes[i]; br++) { - ra_add_transitive_reg_conflict(set->regs, - br + set->first_half_reg, reg); - } - - reg++; - } - } - - for (unsigned i = 0; i < shared_class_count; i++) { - for (unsigned j = 0; j < SHARED_CLASS_REGS(i); j++) { - for (unsigned br = j; br < j + shared_class_sizes[i]; br++) { - ra_add_transitive_reg_conflict(set->regs, - br + set->first_shared_reg, reg); - } - - reg++; - } - } - - /* - * Setup conflicts with registers over 0x3f for the special vreg - * that exists to use as interference for tex-prefetch: - */ - - for (unsigned i = 0x40; i < CLASS_REGS(0); i++) { - ra_add_transitive_reg_conflict(set->regs, i, - set->prefetch_exclude_reg); - } - - for (unsigned i = 0x40; i < HALF_CLASS_REGS(0); i++) { - ra_add_transitive_reg_conflict(set->regs, i + set->first_half_reg, - set->prefetch_exclude_reg); - } -} - -/* One-time setup of RA register-set, which describes all the possible - * "virtual" registers and their interferences. Ie. double register - * occupies (and conflicts with) two single registers, and so forth. - * Since registers do not need to be aligned to their class size, they - * can conflict with other registers in the same class too. Ie: - * - * Single (base) | Double - * --------------+--------------- - * R0 | D0 - * R1 | D0 D1 - * R2 | D1 D2 - * R3 | D2 - * .. and so on.. - * - * (NOTE the disassembler uses notation like r0.x/y/z/w but those are - * really just four scalar registers. Don't let that confuse you.) - */ -struct ir3_ra_reg_set * -ir3_ra_alloc_reg_set(struct ir3_compiler *compiler, bool mergedregs) -{ - struct ir3_ra_reg_set *set = rzalloc(compiler, struct ir3_ra_reg_set); - unsigned ra_reg_count, reg, base; - - /* calculate # of regs across all classes: */ - ra_reg_count = 0; - for (unsigned i = 0; i < class_count; i++) - ra_reg_count += CLASS_REGS(i); - for (unsigned i = 0; i < half_class_count; i++) - ra_reg_count += HALF_CLASS_REGS(i); - for (unsigned i = 0; i < shared_class_count; i++) - ra_reg_count += SHARED_CLASS_REGS(i); - - ra_reg_count += 1; /* for tex-prefetch excludes */ - - /* allocate the reg-set.. */ - set->regs = ra_alloc_reg_set(set, ra_reg_count, true); - set->ra_reg_to_gpr = ralloc_array(set, uint16_t, ra_reg_count); - set->gpr_to_ra_reg = ralloc_array(set, uint16_t *, total_class_count); - - /* .. 
and classes */ - reg = 0; - for (unsigned i = 0; i < class_count; i++) { - set->classes[i] = ra_alloc_reg_class(set->regs); - - set->gpr_to_ra_reg[i] = ralloc_array(set, uint16_t, CLASS_REGS(i)); - - for (unsigned j = 0; j < CLASS_REGS(i); j++) { - ra_class_add_reg(set->classes[i], reg); - - set->ra_reg_to_gpr[reg] = j; - set->gpr_to_ra_reg[i][j] = reg; - - reg++; - } - } - - set->first_half_reg = reg; - base = HALF_OFFSET; - - for (unsigned i = 0; i < half_class_count; i++) { - set->half_classes[i] = ra_alloc_reg_class(set->regs); - - set->gpr_to_ra_reg[base + i] = - ralloc_array(set, uint16_t, HALF_CLASS_REGS(i)); - - for (unsigned j = 0; j < HALF_CLASS_REGS(i); j++) { - ra_class_add_reg(set->half_classes[i], reg); - - set->ra_reg_to_gpr[reg] = j; - set->gpr_to_ra_reg[base + i][j] = reg; - - reg++; - } - } - - set->first_shared_reg = reg; - base = SHARED_OFFSET; - - for (unsigned i = 0; i < shared_class_count; i++) { - set->shared_classes[i] = ra_alloc_reg_class(set->regs); - - set->gpr_to_ra_reg[base + i] = - ralloc_array(set, uint16_t, SHARED_CLASS_REGS(i)); - - for (unsigned j = 0; j < SHARED_CLASS_REGS(i); j++) { - ra_class_add_reg(set->shared_classes[i], reg); - - set->ra_reg_to_gpr[reg] = j; - set->gpr_to_ra_reg[base + i][j] = reg; - - reg++; - } - } - - /* - * Setup an additional class, with one vreg, to simply conflict - * with registers that are too high to encode tex-prefetch. This - * vreg is only used to setup additional conflicts so that RA - * knows to allocate prefetch dst regs below the limit: - */ - set->prefetch_exclude_class = ra_alloc_reg_class(set->regs); - ra_class_add_reg(set->prefetch_exclude_class, reg); - set->prefetch_exclude_reg = reg++; - - /* - * And finally setup conflicts. Starting a6xx, half precision regs - * conflict w/ full precision regs (when using MERGEDREGS): - */ - if (mergedregs) { - for (unsigned i = 0; i < CLASS_REGS(0) / 2; i++) { - unsigned freg = set->gpr_to_ra_reg[0][i]; - unsigned hreg0 = set->gpr_to_ra_reg[0 + HALF_OFFSET][(i * 2) + 0]; - unsigned hreg1 = set->gpr_to_ra_reg[0 + HALF_OFFSET][(i * 2) + 1]; - - ra_add_transitive_reg_pair_conflict(set->regs, freg, hreg0, hreg1); - } - } - - setup_conflicts(set); - - ra_set_finalize(set->regs, NULL); - - return set; -} - -int -ra_size_to_class(unsigned sz, bool half, bool shared) -{ - if (shared) { - for (unsigned i = 0; i < shared_class_count; i++) - if (shared_class_sizes[i] >= sz) - return i + SHARED_OFFSET; - } else if (half) { - for (unsigned i = 0; i < half_class_count; i++) - if (half_class_sizes[i] >= sz) - return i + HALF_OFFSET; - } else { - for (unsigned i = 0; i < class_count; i++) - if (class_sizes[i] >= sz) - return i; - } - debug_assert(0); - return -1; -} - -int -ra_class_to_size(unsigned class, bool *half, bool *shared) -{ - *half = *shared = false; - - if (class >= SHARED_OFFSET) { - *shared = true; - return shared_class_sizes[class - SHARED_OFFSET]; - } else if (class >= HALF_OFFSET) { - *half = true; - return half_class_sizes[class - HALF_OFFSET]; - } else { - return class_sizes[class]; - } -} diff --git a/src/freedreno/ir3/ir3_spill.c b/src/freedreno/ir3/ir3_spill.c new file mode 100644 index 0000000..7342c9b --- /dev/null +++ b/src/freedreno/ir3/ir3_spill.c @@ -0,0 +1,362 @@ +/* + * Copyright (C) 2021 Valve Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to 
use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "ir3_ra.h" +#include "ir3_shader.h" +#include "util/rb_tree.h" + +/* + * This pass does one thing so far: + * + * 1. Calculates the maximum register pressure. To do this, we need to use the + * exact same technique that RA uses for combining meta_split instructions + * with their sources, so that our calculation agrees with RA. + * + * It will also optionally spill registers once that's implemented. + */ + +struct ra_spill_interval { + struct ir3_reg_interval interval; +}; + +struct ra_spill_ctx { + struct ir3_reg_ctx reg_ctx; + + struct ra_spill_interval *intervals; + + struct ir3_pressure cur_pressure, max_pressure; + + struct ir3_liveness *live; + + const struct ir3_compiler *compiler; +}; + +static void +ra_spill_interval_init(struct ra_spill_interval *interval, struct ir3_register *reg) +{ + ir3_reg_interval_init(&interval->interval, reg); +} + +static void +ra_pressure_add(struct ir3_pressure *pressure, struct ra_spill_interval *interval) +{ + unsigned size = reg_size(interval->interval.reg); + if (interval->interval.reg->flags & IR3_REG_SHARED) + pressure->shared += size; + else if (interval->interval.reg->flags & IR3_REG_HALF) + pressure->half += size; + else + pressure->full += size; +} + +static void +ra_pressure_sub(struct ir3_pressure *pressure, struct ra_spill_interval *interval) +{ + unsigned size = reg_size(interval->interval.reg); + if (interval->interval.reg->flags & IR3_REG_SHARED) + pressure->shared -= size; + else if (interval->interval.reg->flags & IR3_REG_HALF) + pressure->half -= size; + else + pressure->full -= size; +} + +static struct ra_spill_interval * +ir3_reg_interval_to_interval(struct ir3_reg_interval *interval) +{ + return rb_node_data(struct ra_spill_interval, interval, interval); +} + +static struct ra_spill_ctx * +ir3_reg_ctx_to_ctx(struct ir3_reg_ctx *ctx) +{ + return rb_node_data(struct ra_spill_ctx, ctx, reg_ctx); +} + +static void +interval_add(struct ir3_reg_ctx *_ctx, struct ir3_reg_interval *_interval) +{ + struct ra_spill_interval *interval = ir3_reg_interval_to_interval(_interval); + struct ra_spill_ctx *ctx = ir3_reg_ctx_to_ctx(_ctx); + + ra_pressure_add(&ctx->cur_pressure, interval); +} + +static void +interval_delete(struct ir3_reg_ctx *_ctx, struct ir3_reg_interval *_interval) +{ + struct ra_spill_interval *interval = ir3_reg_interval_to_interval(_interval); + struct ra_spill_ctx *ctx = ir3_reg_ctx_to_ctx(_ctx); + + ra_pressure_sub(&ctx->cur_pressure, interval); +} + +static void +interval_readd(struct ir3_reg_ctx *_ctx, struct ir3_reg_interval *_parent, + struct ir3_reg_interval *_child) +{ + interval_add(_ctx, _child); +} + +static void +spill_ctx_init(struct 
ra_spill_ctx *ctx) +{ + rb_tree_init(&ctx->reg_ctx.intervals); + ctx->reg_ctx.interval_add = interval_add; + ctx->reg_ctx.interval_delete = interval_delete; + ctx->reg_ctx.interval_readd = interval_readd; +} + +static void +ra_spill_ctx_insert(struct ra_spill_ctx *ctx, struct ra_spill_interval *interval) +{ + ir3_reg_interval_insert(&ctx->reg_ctx, &interval->interval); +} + +static void +ra_spill_ctx_remove(struct ra_spill_ctx *ctx, struct ra_spill_interval *interval) +{ + ir3_reg_interval_remove(&ctx->reg_ctx, &interval->interval); +} + +static void +init_dst(struct ra_spill_ctx *ctx, struct ir3_register *dst) +{ + struct ra_spill_interval *interval = &ctx->intervals[dst->name]; + ra_spill_interval_init(interval, dst); +} + +static void +insert_dst(struct ra_spill_ctx *ctx, struct ir3_register *dst) +{ + struct ra_spill_interval *interval = &ctx->intervals[dst->name]; + if (interval->interval.inserted) + return; + + ra_spill_ctx_insert(ctx, interval); + + /* For precolored inputs, make sure we leave enough registers to allow for + * holes in the inputs. It can happen that the binning shader has a lower + * register pressure than the main shader, but the main shader decided to + * add holes between the inputs which means that the binning shader has a + * higher register demand. + */ + if (dst->instr->opc == OPC_META_INPUT && + dst->num != INVALID_REG) { + physreg_t physreg = ra_reg_get_physreg(dst); + physreg_t max = physreg + reg_size(dst); + + if (interval->interval.reg->flags & IR3_REG_SHARED) + ctx->max_pressure.shared = MAX2(ctx->max_pressure.shared, max); + else if (interval->interval.reg->flags & IR3_REG_HALF) + ctx->max_pressure.half = MAX2(ctx->max_pressure.half, max); + else + ctx->max_pressure.full = MAX2(ctx->max_pressure.full, max); + } +} + +static void +remove_src_early(struct ra_spill_ctx *ctx, struct ir3_instruction *instr, struct ir3_register *src) +{ + if (!(src->flags & IR3_REG_FIRST_KILL)) + return; + + struct ra_spill_interval *interval = &ctx->intervals[src->def->name]; + + if (!interval->interval.inserted || interval->interval.parent || + !rb_tree_is_empty(&interval->interval.children)) + return; + + ra_spill_ctx_remove(ctx, interval); +} + +static void +remove_src(struct ra_spill_ctx *ctx, struct ir3_instruction *instr, struct ir3_register *src) +{ + if (!(src->flags & IR3_REG_FIRST_KILL)) + return; + + struct ra_spill_interval *interval = &ctx->intervals[src->def->name]; + + if (!interval->interval.inserted) + return; + + ra_spill_ctx_remove(ctx, interval); +} + +static void +remove_dst(struct ra_spill_ctx *ctx, struct ir3_register *dst) +{ + struct ra_spill_interval *interval = &ctx->intervals[dst->name]; + + if (!interval->interval.inserted) + return; + + ra_spill_ctx_remove(ctx, interval); +} + +static void +update_max_pressure(struct ra_spill_ctx *ctx) +{ + d("pressure:"); + d("\tfull: %u", ctx->cur_pressure.full); + d("\thalf: %u", ctx->cur_pressure.half); + d("\tshared: %u", ctx->cur_pressure.shared); + + ctx->max_pressure.full = + MAX2(ctx->max_pressure.full, ctx->cur_pressure.full); + ctx->max_pressure.half = + MAX2(ctx->max_pressure.half, ctx->cur_pressure.half); + ctx->max_pressure.shared = + MAX2(ctx->max_pressure.shared, ctx->cur_pressure.shared); +} + +static void +handle_instr(struct ra_spill_ctx *ctx, struct ir3_instruction *instr) +{ + if (RA_DEBUG) { + printf("processing: "); + ir3_print_instr(instr); + } + + ra_foreach_dst(dst, instr) { + init_dst(ctx, dst); + } + + /* Handle tied destinations. 
If a destination is tied to a source and that + * source is live-through, then we need to allocate a new register for the + * destination which is live-through itself and cannot overlap the + * sources. + */ + + ra_foreach_dst(dst, instr) { + if (!ra_reg_is_array_rmw(dst)) { + struct ir3_register *tied_src = + ra_dst_get_tied_src(ctx->compiler, dst); + if (tied_src && !(tied_src->flags & IR3_REG_FIRST_KILL)) + insert_dst(ctx, dst); + } + } + + update_max_pressure(ctx); + + ra_foreach_src(src, instr) { + if (src->flags & IR3_REG_FIRST_KILL) + remove_src_early(ctx, instr, src); + } + + + ra_foreach_dst(dst, instr) { + insert_dst(ctx, dst); + } + + update_max_pressure(ctx); + + for (unsigned i = 0; i < instr->regs_count; i++) { + if (ra_reg_is_src(instr->regs[i]) && + (instr->regs[i]->flags & IR3_REG_FIRST_KILL)) + remove_src(ctx, instr, instr->regs[i]); + else if (ra_reg_is_dst(instr->regs[i]) && + (instr->regs[i]->flags & IR3_REG_UNUSED)) + remove_dst(ctx, instr->regs[i]); + } +} + +static void +handle_input_phi(struct ra_spill_ctx *ctx, struct ir3_instruction *instr) +{ + init_dst(ctx, instr->regs[0]); + insert_dst(ctx, instr->regs[0]); +} + +static void +remove_input_phi(struct ra_spill_ctx *ctx, struct ir3_instruction *instr) +{ + ra_foreach_src(src, instr) + remove_src(ctx, instr, src); + if (instr->regs[0]->flags & IR3_REG_UNUSED) + remove_dst(ctx, instr->regs[0]); +} + +static void +handle_live_in(struct ra_spill_ctx *ctx, struct ir3_register *def) +{ + struct ra_spill_interval *interval = &ctx->intervals[def->name]; + ra_spill_interval_init(interval, def); + insert_dst(ctx, def); +} + +static void +handle_block(struct ra_spill_ctx *ctx, struct ir3_block *block) +{ + memset(&ctx->cur_pressure, 0, sizeof(ctx->cur_pressure)); + rb_tree_init(&ctx->reg_ctx.intervals); + + unsigned name; + BITSET_FOREACH_SET(name, ctx->live->live_in[block->index], + ctx->live->definitions_count) { + struct ir3_register *reg = ctx->live->definitions[name]; + handle_live_in(ctx, reg); + } + + foreach_instr (instr, &block->instr_list) { + if (instr->opc != OPC_META_PHI && instr->opc != OPC_META_INPUT && + instr->opc != OPC_META_TEX_PREFETCH) + break; + handle_input_phi(ctx, instr); + } + + update_max_pressure(ctx); + + foreach_instr (instr, &block->instr_list) { + if (instr->opc == OPC_META_PHI || instr->opc == OPC_META_INPUT || + instr->opc == OPC_META_TEX_PREFETCH) + remove_input_phi(ctx, instr); + else + handle_instr(ctx, instr); + } +} + +void +ir3_calc_pressure(struct ir3_shader_variant *v, struct ir3_liveness *live, + struct ir3_pressure *max_pressure) +{ + struct ra_spill_ctx ctx = {}; + ctx.live = live; + ctx.intervals = calloc(live->definitions_count, sizeof(*ctx.intervals)); + ctx.compiler = v->shader->compiler; + spill_ctx_init(&ctx); + + foreach_block (block, &v->ir->block_list) { + handle_block(&ctx, block); + } + + assert(ctx.cur_pressure.full == 0); + assert(ctx.cur_pressure.half == 0); + assert(ctx.cur_pressure.shared == 0); + + free(ctx.intervals); + + *max_pressure = ctx.max_pressure; +} + diff --git a/src/freedreno/ir3/meson.build b/src/freedreno/ir3/meson.build index b3799f1..c27b267 100644 --- a/src/freedreno/ir3/meson.build +++ b/src/freedreno/ir3/meson.build @@ -81,11 +81,13 @@ libfreedreno_ir3_files = files( 'ir3_delay.c', 'ir3_dominance.c', 'ir3_disk_cache.c', - 'ir3_group.c', 'ir3_image.c', 'ir3_image.h', 'ir3.h', 'ir3_legalize.c', + 'ir3_liveness.c', + 'ir3_lower_parallelcopy.c', + 'ir3_merge_regs.c', 'ir3_nir.c', 'ir3_nir.h', 'ir3_nir_analyze_ubo_ranges.c', @@ -100,10 +102,10 @@ 
libfreedreno_ir3_files = files( 'ir3_print.c', 'ir3_ra.c', 'ir3_ra.h', - 'ir3_ra_regset.c', 'ir3_sched.c', 'ir3_shader.c', 'ir3_shader.h', + 'ir3_spill.c', 'ir3_validate.c', 'regmask.h', ) diff --git a/src/gallium/drivers/freedreno/ci/piglit-freedreno-a530-fails.txt b/src/gallium/drivers/freedreno/ci/piglit-freedreno-a530-fails.txt index 3a15fd3..cf601d8 100644 --- a/src/gallium/drivers/freedreno/ci/piglit-freedreno-a530-fails.txt +++ b/src/gallium/drivers/freedreno/ci/piglit-freedreno-a530-fails.txt @@ -336,6 +336,6 @@ wgl@wgl-multi-context-single-window,Fail wgl@wgl-multi-window-single-context,Fail wgl@wgl-sanity,Fail spec@glsl-1.30@execution@clipping@fs-clip-distance-interpolated,Crash -spec@glsl-1.30@execution@fs-large-local-array-vec2,Crash -spec@glsl-1.30@execution@fs-large-local-array-vec3,Crash -spec@glsl-1.30@execution@fs-large-local-array-vec4,Crash +spec@glsl-1.30@execution@fs-large-local-array-vec2,Fail +spec@glsl-1.30@execution@fs-large-local-array-vec3,Fail +spec@glsl-1.30@execution@fs-large-local-array-vec4,Fail diff --git a/src/gallium/drivers/freedreno/ci/piglit-freedreno-a630-fails.txt b/src/gallium/drivers/freedreno/ci/piglit-freedreno-a630-fails.txt index 5c4d78e..f149728 100644 --- a/src/gallium/drivers/freedreno/ci/piglit-freedreno-a630-fails.txt +++ b/src/gallium/drivers/freedreno/ci/piglit-freedreno-a630-fails.txt @@ -475,8 +475,6 @@ spec@arb_tessellation_shader@execution@variable-indexing@tes-input-array-float-i spec@arb_tessellation_shader@execution@variable-indexing@tes-input-array-vec2-index-rd,Crash spec@arb_tessellation_shader@execution@variable-indexing@tes-input-array-vec3-index-rd,Crash spec@arb_tessellation_shader@execution@variable-indexing@tes-input-array-vec4-index-rd,Crash -spec@arb_tessellation_shader@execution@variable-indexing@vs-output-array-vec3-index-wr-before-tcs,Fail -spec@arb_tessellation_shader@execution@variable-indexing@vs-output-array-vec4-index-wr-before-tcs,Fail spec@arb_tessellation_shader@execution@vertex-partial-write,Crash spec@arb_tessellation_shader@execution@vs-tes-max-in-out-components,Fail spec@arb_tessellation_shader@execution@vs-tes-tessinner-tessouter-inputs-quads,Fail @@ -507,11 +505,8 @@ spec@glsl-1.50@execution@compatibility@vs-gs-texcoord-array-2,Crash spec@glsl-1.50@execution@geometry@max-input-components,Fail spec@glsl-1.50@execution@primitive-id-no-gs-quad-strip,Fail spec@glsl-1.50@execution@primitive-id-no-gs-quads,Fail -spec@glsl-1.50@execution@variable-indexing@gs-input-array-float-index-rd,Fail spec@glsl-1.50@execution@variable-indexing@gs-input-array-vec2-index-rd,Fail spec@glsl-1.50@execution@variable-indexing@gs-input-array-vec3-index-rd,Fail spec@glsl-1.50@execution@variable-indexing@gs-output-array-vec3-index-wr,Fail spec@glsl-1.50@execution@variable-indexing@gs-output-array-vec4-index-wr,Crash -spec@glsl-1.50@execution@variable-indexing@vs-output-array-vec3-index-wr-before-gs,Fail spec@glsl-1.50@execution@variable-indexing@vs-output-array-vec4-index-wr-before-gs,Fail -spec@glsl-es-3.10@execution@cs-image-atomic-if-else-2,Fail -- 2.7.4
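
A note on the physreg_t helpers added to ir3_ra.h: they flatten each register file into a single index space, where a full register occupies two consecutive half-register slots and shared registers are numbered from r48 upward in the flat "num" space. The standalone sketch below shows that round trip; the TOY_* flag values and main() are placeholders invented for illustration, and only the two conversion helpers mirror the patch.

/* Standalone sketch of the physreg mapping from ir3_ra.h.  TOY_HALF and
 * TOY_SHARED are placeholder flags (the real bits live in ir3.h); the two
 * helpers mirror ra_physreg_to_num()/ra_num_to_physreg() from the patch.
 */
#include <assert.h>
#include <stdint.h>

typedef uint16_t physreg_t;

#define TOY_HALF   0x1   /* stands in for IR3_REG_HALF */
#define TOY_SHARED 0x2   /* stands in for IR3_REG_SHARED */

static unsigned
physreg_to_num(physreg_t physreg, unsigned flags)
{
	if (!(flags & TOY_HALF))
		physreg /= 2;          /* a full reg covers two half slots */
	if (flags & TOY_SHARED)
		physreg += 48 * 4;     /* shared file starts at r48.x */
	return physreg;
}

static physreg_t
num_to_physreg(unsigned num, unsigned flags)
{
	if (flags & TOY_SHARED)
		num -= 48 * 4;
	if (!(flags & TOY_HALF))
		num *= 2;
	return num;
}

int
main(void)
{
	/* r1.y: num = (1 << 2) | 1 = 5, lands at half-slot 10 */
	assert(num_to_physreg(5, 0) == 10);
	assert(physreg_to_num(10, 0) == 5);

	/* hr1.y: same num, but half regs map 1:1 */
	assert(num_to_physreg(5, TOY_HALF) == 5);

	/* shared r48.x: num = 48 * 4, first slot of the shared file */
	assert(num_to_physreg(48 * 4, TOY_SHARED) == 0);
	assert(physreg_to_num(0, TOY_SHARED) == 48 * 4);
	return 0;
}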
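
ir3_spill.c tracks pressure through the interval forest described in ir3_ra.h: only top-level intervals contribute, since a split or collect coalesced into its parent vector occupies the same registers, and the interval_add/interval_delete/interval_readd callbacks fire only when the set of top-level intervals changes. The toy structs below are invented for the sketch and do not use util/rb_tree or the real ir3_reg_interval API; they only mimic that accounting.

/* Much-simplified stand-in for ir3_reg_ctx: pressure changes only when a
 * top-level interval is added or removed, and removing a parent re-adds its
 * children at the top level.  All toy_* types and the size units are
 * invented for the sketch.
 */
#include <assert.h>
#include <stddef.h>

struct toy_interval {
	unsigned size;                 /* size in allocation units (toy value) */
	struct toy_interval *parent;   /* NULL when top-level */
};

struct toy_ctx {
	unsigned pressure;
};

static void
toy_insert(struct toy_ctx *ctx, struct toy_interval *interval,
           struct toy_interval *parent)
{
	interval->parent = parent;
	if (!parent)
		ctx->pressure += interval->size;   /* like the interval_add callback */
}

static void
toy_remove(struct toy_ctx *ctx, struct toy_interval *interval,
           struct toy_interval **children, unsigned num_children)
{
	assert(!interval->parent);
	ctx->pressure -= interval->size;           /* like interval_delete */
	for (unsigned i = 0; i < num_children; i++) {
		children[i]->parent = NULL;
		ctx->pressure += children[i]->size;    /* like interval_readd */
	}
}

int
main(void)
{
	struct toy_ctx ctx = { 0 };
	struct toy_interval vec4  = { .size = 4 };   /* a vector value */
	struct toy_interval split = { .size = 1 };   /* one component of it */

	toy_insert(&ctx, &vec4, NULL);
	assert(ctx.pressure == 4);

	/* the split lives inside its parent: no extra pressure */
	toy_insert(&ctx, &split, &vec4);
	assert(ctx.pressure == 4);

	/* parent dies, the split survives and becomes top-level */
	struct toy_interval *kids[] = { &split };
	toy_remove(&ctx, &vec4, kids, 1);
	assert(ctx.pressure == 1);
	return 0;
}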
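
struct ir3_liveness keeps one live_in/live_out bitset of SSA definitions per block. ir3_calc_liveness itself is not part of the hunks above, so the following is only a generic backward-dataflow sketch of how such sets are typically computed, with uint64_t masks and toy_* types standing in for the real BITSET_WORD arrays and ir3 blocks.

/* Generic backward liveness over a tiny CFG: live_out[b] is the union of the
 * successors' live_in, and live_in[b] = (live_out[b] & ~def[b]) | use[b].
 * Everything here (uint64_t masks, the MAX_* limits) is invented for the
 * sketch and is not the ir3_liveness.c implementation.
 */
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

#define MAX_SUCCS 2

struct toy_block {
	uint64_t def, use;      /* defs / upward-exposed uses in the block */
	int succ[MAX_SUCCS];    /* successor indices, -1 if unused */
};

static void
calc_liveness(const struct toy_block *blocks, unsigned count,
              uint64_t *live_in, uint64_t *live_out)
{
	bool progress = true;
	while (progress) {
		progress = false;
		/* iterate in reverse to converge faster on typical CFGs */
		for (int b = (int)count - 1; b >= 0; b--) {
			uint64_t out = 0;
			for (unsigned s = 0; s < MAX_SUCCS; s++)
				if (blocks[b].succ[s] >= 0)
					out |= live_in[blocks[b].succ[s]];
			uint64_t in = (out & ~blocks[b].def) | blocks[b].use;
			if (in != live_in[b] || out != live_out[b]) {
				live_in[b] = in;
				live_out[b] = out;
				progress = true;
			}
		}
	}
}

int
main(void)
{
	/* block 0 defines value #0, its successor block 1 uses it */
	struct toy_block blocks[2] = {
		{ .def = 1ull << 0, .use = 0,         .succ = {  1, -1 } },
		{ .def = 0,         .use = 1ull << 0, .succ = { -1, -1 } },
	};
	uint64_t live_in[2] = { 0 }, live_out[2] = { 0 };

	calc_liveness(blocks, 2, live_in, live_out);
	assert(live_out[0] == (1ull << 0));   /* #0 is live across the edge */
	assert(live_in[1] == (1ull << 0));
	assert(live_in[0] == 0);              /* defined before any use */
	return 0;
}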