From c9b7a37b8f7979433655e269a2b161d33eb41659 Mon Sep 17 00:00:00 2001
From: =?utf8?q?Marek=20Ol=C5=A1=C3=A1k?=
Date: Tue, 14 Aug 2018 02:01:18 -0400
Subject: [PATCH] radeonsi: cull primitives with async compute for large draw calls
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

Tested-by: Dieter Nützel
Acked-by: Nicolai Hähnle
---
 src/gallium/drivers/radeonsi/Makefile.sources | 1 +
 src/gallium/drivers/radeonsi/meson.build | 1 +
 .../drivers/radeonsi/si_compute_prim_discard.c | 1567 ++++++++++++++++++++
 src/gallium/drivers/radeonsi/si_cp_dma.c | 8 +-
 src/gallium/drivers/radeonsi/si_debug.c | 32 +-
 src/gallium/drivers/radeonsi/si_fence.c | 8 +-
 src/gallium/drivers/radeonsi/si_gfx_cs.c | 80 +
 src/gallium/drivers/radeonsi/si_pipe.c | 12 +-
 src/gallium/drivers/radeonsi/si_pipe.h | 63 +-
 src/gallium/drivers/radeonsi/si_query.c | 6 +
 src/gallium/drivers/radeonsi/si_shader.c | 63 +
 src/gallium/drivers/radeonsi/si_shader.h | 14 +
 src/gallium/drivers/radeonsi/si_state.c | 9 +
 src/gallium/drivers/radeonsi/si_state.h | 5 +
 src/gallium/drivers/radeonsi/si_state_draw.c | 252 +++-
 src/gallium/drivers/radeonsi/si_state_msaa.c | 4 +
 src/gallium/drivers/radeonsi/si_state_shaders.c | 21 +-
 src/gallium/drivers/radeonsi/si_state_viewport.c | 6 +
 18 files changed, 2124 insertions(+), 28 deletions(-)
 create mode 100644 src/gallium/drivers/radeonsi/si_compute_prim_discard.c

diff --git a/src/gallium/drivers/radeonsi/Makefile.sources b/src/gallium/drivers/radeonsi/Makefile.sources
index 713629c..62747f5 100644
--- a/src/gallium/drivers/radeonsi/Makefile.sources
+++ b/src/gallium/drivers/radeonsi/Makefile.sources
@@ -10,6 +10,7 @@ C_SOURCES := \
 	si_build_pm4.h \
 	si_clear.c \
 	si_compute.c \
+	si_compute_prim_discard.c \
 	si_compute.h \
 	si_compute_blit.c \
 	si_cp_dma.c \
diff --git a/src/gallium/drivers/radeonsi/meson.build b/src/gallium/drivers/radeonsi/meson.build
index cf3b24c..ae216bc 100644
--- a/src/gallium/drivers/radeonsi/meson.build
+++ b/src/gallium/drivers/radeonsi/meson.build
@@ -26,6 +26,7 @@ files_libradeonsi = files(
   'si_build_pm4.h',
   'si_clear.c',
   'si_compute.c',
+  'si_compute_prim_discard.c',
   'si_compute.h',
   'si_compute_blit.c',
   'si_cp_dma.c',
diff --git a/src/gallium/drivers/radeonsi/si_compute_prim_discard.c b/src/gallium/drivers/radeonsi/si_compute_prim_discard.c
new file mode 100644
index 0000000..71253c5
--- /dev/null
+++ b/src/gallium/drivers/radeonsi/si_compute_prim_discard.c
@@ -0,0 +1,1567 @@
+/*
+ * Copyright 2019 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#include "si_pipe.h"
+#include "si_shader_internal.h"
+#include "sid.h"
+#include "si_build_pm4.h"
+#include "ac_llvm_cull.h"
+
+#include "util/u_prim.h"
+#include "util/u_suballoc.h"
+#include "util/u_upload_mgr.h"
+#include "util/fast_idiv_by_const.h"
+
+/* Based on:
+ * https://frostbite-wp-prd.s3.amazonaws.com/wp-content/uploads/2016/03/29204330/GDC_2016_Compute.pdf
+ */
+
+/* This file implements primitive culling using asynchronous compute.
+ * It's written to be GL conformant.
+ *
+ * It takes a monolithic VS in LLVM IR returning gl_Position and invokes it
+ * in a compute shader. The shader processes 1 primitive/thread by invoking
+ * the VS for each vertex to get the positions, decomposes strips and fans
+ * into triangles (if needed), eliminates primitive restart (if needed),
+ * does (W<0) culling, face culling, view XY culling, zero-area and
+ * small-primitive culling, and generates a new index buffer that doesn't
+ * contain culled primitives.
+ *
+ * The index buffer is generated using the Ordered Count feature of GDS,
+ * which is an atomic counter that is incremented in the wavefront launch
+ * order, so that the original primitive order is preserved.
+ *
+ * Another GDS ordered counter is used to eliminate primitive restart indices.
+ * If a restart index lands on an even thread ID, the compute shader has to flip
+ * the primitive orientation of the whole following triangle strip. The primitive
+ * orientation has to be correct after strip and fan decomposition for two-sided
+ * shading to behave correctly. The decomposition also needs to be aware of
+ * which vertex is the provoking vertex for flat shading to behave correctly.
+ *
+ * IB = a GPU command buffer
+ *
+ * Both the compute and gfx IBs run in parallel sort of like CE and DE.
+ * The gfx IB has a CP barrier (REWIND packet) before a draw packet. REWIND
+ * doesn't continue if its word isn't 0x80000000. Once compute shaders are
+ * finished culling, the last wave will write the final primitive count from
+ * GDS directly into the count word of the draw packet in the gfx IB, and
+ * a CS_DONE event will signal the REWIND packet to continue. It's really
+ * a direct draw with command buffer patching from the compute queue.
+ *
+ * The compute IB doesn't have to start when its corresponding gfx IB starts,
+ * but can start sooner. The compute IB is signaled to start after the last
+ * execution barrier in the *previous* gfx IB. This is handled as follows.
+ * The kernel GPU scheduler starts the compute IB after the previous gfx IB has
+ * started. The compute IB then waits (WAIT_REG_MEM) for a mid-IB fence that
+ * represents the barrier in the previous gfx IB.
+ *
+ * Features:
+ * - Triangle strips and fans are decomposed into an indexed triangle list.
+ *   The decomposition differs based on the provoking vertex state.
+ * - Instanced draws are converted into non-instanced draws for 16-bit indices.
+ *   (InstanceID is stored in the high bits of VertexID and unpacked by VS)
+ * - Primitive restart is fully supported with triangle strips, including
+ *   correct primitive orientation across multiple waves. (restart indices
+ *   reset primitive orientation)
+ * - W<0 culling (W<0 is behind the viewer, sort of like near Z culling).
+ * - Back face culling, incl. culling zero-area / degenerate primitives.
+ * - View XY culling.
+ * - View Z culling (disabled due to limited impact with perspective projection).
+ * - Small primitive culling for all MSAA modes and all quant modes.
+ *
+ * The following are not implemented:
+ * - ClipVertex/ClipDistance/CullDistance-based culling.
+ * - Scissor culling.
+ * - HiZ culling.
+ *
+ * Limitations (and unimplemented features that may be possible to implement):
+ * - Only triangles, triangle strips, and triangle fans are supported.
+ * - Primitive restart is only supported with triangle strips.
+ * - Instancing and primitive restart can't be used together.
+ * - Instancing is only supported with 16-bit indices and instance count <= 2^16.
+ * - The instance divisor buffer is unavailable, so all divisors must be
+ *   either 0 or 1.
+ * - Multidraws where the vertex shader reads gl_DrawID are unsupported.
+ * - No support for tessellation and geometry shaders.
+ *   (patch elimination where tess factors are 0 would be possible to implement)
+ * - The vertex shader must not contain memory stores.
+ * - All VS resources must not have a write usage in the command buffer.
+ *   (TODO: all shader buffers currently set the write usage)
+ * - Bindless textures and images must not occur in the vertex shader.
+ *
+ * User data SGPR layout:
+ *   INDEX_BUFFERS: pointer to constants
+ *     0..3: input index buffer - typed buffer view
+ *     4..7: output index buffer - typed buffer view
+ *     8..11: viewport state - scale.xy, translate.xy
+ *   VERTEX_COUNTER: counter address or first primitive ID
+ *     - If unordered memory counter: address of "count" in the draw packet
+ *       and is incremented atomically by the shader.
+ *     - If unordered GDS counter: address of "count" in GDS starting from 0,
+ *       must be initialized to 0 before the dispatch.
+ *     - If ordered GDS counter: the primitive ID that should reset the vertex
+ *       counter to 0 in GDS
+ *   LAST_WAVE_PRIM_ID: the primitive ID that should write the final vertex
+ *     count to memory if using GDS ordered append
+ *   VERTEX_COUNT_ADDR: where the last wave should write the vertex count if
+ *     using GDS ordered append
+ *   VS.VERTEX_BUFFERS: same value as VS
+ *   VS.CONST_AND_SHADER_BUFFERS: same value as VS
+ *   VS.SAMPLERS_AND_IMAGES: same value as VS
+ *   VS.BASE_VERTEX: same value as VS
+ *   VS.START_INSTANCE: same value as VS
+ *   NUM_PRIMS_UDIV_MULTIPLIER: For fast 31-bit division by the number of primitives
+ *     per instance for instancing.
+ *   NUM_PRIMS_UDIV_TERMS:
+ *     - Bits [0:4]: "post_shift" for fast 31-bit division for instancing.
+ *     - Bits [5:31]: The number of primitives per instance for computing the remainder.
+ *   PRIMITIVE_RESTART_INDEX
+ *   SMALL_PRIM_CULLING_PRECISION: Scale the primitive bounding box by this number.
+ *
+ *
+ * The code contains 3 codepaths:
+ * - Unordered memory counter (for debugging, random primitive order, no primitive restart)
+ * - Unordered GDS counter (for debugging, random primitive order, no primitive restart)
+ * - Ordered GDS counter (it preserves the primitive order)
+ *
+ * How to test primitive restart (the most complicated part because it needs
+ * to get the primitive orientation right):
+ * Set THREADGROUP_SIZE to 2 to exercise both intra-wave and inter-wave
+ * primitive orientation flips with small draw calls, which is what most tests use.
+ * You can also enable draw call splitting into draw calls with just 2 primitives.
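+ *
+ * A rough sketch of the gfx/compute handshake described above, in pseudocode
+ * (the actual packets are emitted in si_dispatch_prim_discard_cs_and_draw and
+ * si_compute_signal_gfx below):
+ *
+ *   // gfx IB, written at draw time:
+ *   REWIND               // CP blocks here until the dword becomes 0x80000000
+ *   DRAW_INDEX_2         // its index count dword is patched by the compute queue
+ *
+ *   // compute IB, running asynchronously:
+ *   WAIT_REG_MEM         // wait for the barrier fence of the previous gfx IB
+ *   DISPATCH_DIRECT      // culling CS; GDS ordered append accumulates the count
+ *   // the last wave stores the final index count into the DRAW_INDEX_2 packet
+ *   RELEASE_MEM(CS_DONE) // then writes 0x80000000 into the REWIND dword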
+ */ + +/* At least 256 is needed for the fastest wave launch rate from compute queues + * due to hw constraints. Nothing in the code needs more than 1 wave/threadgroup. */ +#define THREADGROUP_SIZE 256 /* high numbers limit available VGPRs */ +#define THREADGROUPS_PER_CU 1 /* TGs to launch on 1 CU before going onto the next, max 8 */ +#define MAX_WAVES_PER_SH 0 /* no limit */ +#define INDEX_STORES_USE_SLC 1 /* don't cache indices if L2 is full */ +/* Don't cull Z. We already do (W < 0) culling for primitives behind the viewer. */ +#define CULL_Z 0 +/* 0 = unordered memory counter, 1 = unordered GDS counter, 2 = ordered GDS counter */ +#define VERTEX_COUNTER_GDS_MODE 2 +#define GDS_SIZE_UNORDERED (4 * 1024) /* only for the unordered GDS counter */ + +/* Grouping compute dispatches for small draw calls: How many primitives from multiple + * draw calls to process by compute before signaling the gfx IB. This reduces the number + * of EOP events + REWIND packets, because they decrease performance. */ +#define PRIMS_PER_BATCH (512 * 1024) +/* Draw call splitting at the packet level. This allows signaling the gfx IB + * for big draw calls sooner, but doesn't allow context flushes between packets. + * Primitive restart is supported. Only implemented for ordered append. */ +#define SPLIT_PRIMS_PACKET_LEVEL_VALUE PRIMS_PER_BATCH +/* If there is not enough ring buffer space for the current IB, split draw calls into + * this number of primitives, so that we can flush the context and get free ring space. */ +#define SPLIT_PRIMS_DRAW_LEVEL PRIMS_PER_BATCH + +/* Derived values. */ +#define WAVES_PER_TG DIV_ROUND_UP(THREADGROUP_SIZE, 64) +#define SPLIT_PRIMS_PACKET_LEVEL (VERTEX_COUNTER_GDS_MODE == 2 ? \ + SPLIT_PRIMS_PACKET_LEVEL_VALUE : \ + UINT_MAX & ~(THREADGROUP_SIZE - 1)) + +#define REWIND_SIGNAL_BIT 0x80000000 +/* For emulating the rewind packet on CI. */ +#define FORCE_REWIND_EMULATION 0 + +void si_initialize_prim_discard_tunables(struct si_context *sctx) +{ + sctx->prim_discard_vertex_count_threshold = UINT_MAX; /* disable */ + + if (sctx->chip_class == GFX6 || /* SI support is not implemented */ + !sctx->screen->info.has_gds_ordered_append || + sctx->screen->debug_flags & DBG(NO_PD) || + /* If aux_context == NULL, we are initializing aux_context right now. */ + !sctx->screen->aux_context) + return; + + /* TODO: enable this after the GDS kernel memory management is fixed */ + bool enable_on_pro_graphics_by_default = false; + + if (sctx->screen->debug_flags & DBG(ALWAYS_PD) || + sctx->screen->debug_flags & DBG(PD) || + (enable_on_pro_graphics_by_default && + sctx->screen->info.is_pro_graphics && + (sctx->family == CHIP_BONAIRE || + sctx->family == CHIP_HAWAII || + sctx->family == CHIP_TONGA || + sctx->family == CHIP_FIJI || + sctx->family == CHIP_POLARIS10 || + sctx->family == CHIP_POLARIS11 || + sctx->family == CHIP_VEGA10 || + sctx->family == CHIP_VEGA20))) { + sctx->prim_discard_vertex_count_threshold = 6000 * 3; /* 6K triangles */ + + if (sctx->screen->debug_flags & DBG(ALWAYS_PD)) + sctx->prim_discard_vertex_count_threshold = 0; /* always enable */ + + const uint32_t MB = 1024 * 1024; + const uint64_t GB = 1024 * 1024 * 1024; + + /* The total size is double this per context. + * Greater numbers allow bigger gfx IBs. + */ + if (sctx->screen->info.vram_size <= 2 * GB) + sctx->index_ring_size_per_ib = 64 * MB; + else if (sctx->screen->info.vram_size <= 4 * GB) + sctx->index_ring_size_per_ib = 128 * MB; + else + sctx->index_ring_size_per_ib = 256 * MB; + } +} + +/* Opcode can be "add" or "swap". 
*/ +static LLVMValueRef +si_build_ds_ordered_op(struct si_shader_context *ctx, const char *opcode, + LLVMValueRef m0, LLVMValueRef value, unsigned ordered_count_index, + bool release, bool done) +{ + LLVMValueRef args[] = { + LLVMBuildIntToPtr(ctx->ac.builder, m0, + LLVMPointerType(ctx->i32, AC_ADDR_SPACE_GDS), ""), + value, + LLVMConstInt(ctx->i32, LLVMAtomicOrderingMonotonic, 0), /* ordering */ + ctx->i32_0, /* scope */ + ctx->i1false, /* volatile */ + LLVMConstInt(ctx->i32, ordered_count_index, 0), + LLVMConstInt(ctx->i1, release, 0), + LLVMConstInt(ctx->i1, done, 0), + }; + + char intrinsic[64]; + snprintf(intrinsic, sizeof(intrinsic), "llvm.amdgcn.ds.ordered.%s", opcode); + return ac_build_intrinsic(&ctx->ac, intrinsic, ctx->i32, args, ARRAY_SIZE(args), 0); +} + +static LLVMValueRef si_expand_32bit_pointer(struct si_shader_context *ctx, LLVMValueRef ptr) +{ + uint64_t hi = (uint64_t)ctx->screen->info.address32_hi << 32; + ptr = LLVMBuildZExt(ctx->ac.builder, ptr, ctx->i64, ""); + ptr = LLVMBuildOr(ctx->ac.builder, ptr, LLVMConstInt(ctx->i64, hi, 0), ""); + return LLVMBuildIntToPtr(ctx->ac.builder, ptr, + LLVMPointerType(ctx->i32, AC_ADDR_SPACE_GLOBAL), ""); +} + +struct si_thread0_section { + struct si_shader_context *ctx; + struct lp_build_if_state if_thread0; + LLVMValueRef vgpr_result; /* a VGPR for the value on thread 0. */ + LLVMValueRef saved_exec; +}; + +/* Enter a section that only executes on thread 0. */ +static void si_enter_thread0_section(struct si_shader_context *ctx, + struct si_thread0_section *section, + LLVMValueRef thread_id) +{ + section->ctx = ctx; + section->vgpr_result = ac_build_alloca_undef(&ctx->ac, ctx->i32, "result0"); + + /* This IF has 4 instructions: + * v_and_b32_e32 v, 63, v ; get the thread ID + * v_cmp_eq_u32_e32 vcc, 0, v ; thread ID == 0 + * s_and_saveexec_b64 s, vcc + * s_cbranch_execz BB0_4 + * + * It could just be s_and_saveexec_b64 s, 1. + */ + lp_build_if(§ion->if_thread0, &ctx->gallivm, + LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, thread_id, + ctx->i32_0, "")); +} + +/* Exit a section that only executes on thread 0 and broadcast the result + * to all threads. */ +static void si_exit_thread0_section(struct si_thread0_section *section, + LLVMValueRef *result) +{ + struct si_shader_context *ctx = section->ctx; + + LLVMBuildStore(ctx->ac.builder, *result, section->vgpr_result); + + lp_build_endif(§ion->if_thread0); + + /* Broadcast the result from thread 0 to all threads. */ + *result = ac_build_readlane(&ctx->ac, + LLVMBuildLoad(ctx->ac.builder, section->vgpr_result, ""), NULL); +} + +void si_build_prim_discard_compute_shader(struct si_shader_context *ctx) +{ + struct si_shader_key *key = &ctx->shader->key; + LLVMBuilderRef builder = ctx->ac.builder; + LLVMValueRef vs = ctx->main_fn; + + /* Always inline the VS function. 
*/ + ac_add_function_attr(ctx->ac.context, vs, -1, AC_FUNC_ATTR_ALWAYSINLINE); + LLVMSetLinkage(vs, LLVMPrivateLinkage); + + LLVMTypeRef const_desc_type; + if (ctx->shader->selector->info.const_buffers_declared == 1 && + ctx->shader->selector->info.shader_buffers_declared == 0) + const_desc_type = ctx->f32; + else + const_desc_type = ctx->v4i32; + + struct si_function_info fninfo; + si_init_function_info(&fninfo); + + LLVMValueRef index_buffers_and_constants, vertex_counter, vb_desc, const_desc; + LLVMValueRef base_vertex, start_instance, block_id, local_id, ordered_wave_id; + LLVMValueRef restart_index, vp_scale[2], vp_translate[2], smallprim_precision; + LLVMValueRef num_prims_udiv_multiplier, num_prims_udiv_terms, sampler_desc; + LLVMValueRef last_wave_prim_id, vertex_count_addr; + + add_arg_assign(&fninfo, ARG_SGPR, ac_array_in_const32_addr_space(ctx->v4i32), + &index_buffers_and_constants); + add_arg_assign(&fninfo, ARG_SGPR, ctx->i32, &vertex_counter); + add_arg_assign(&fninfo, ARG_SGPR, ctx->i32, &last_wave_prim_id); + add_arg_assign(&fninfo, ARG_SGPR, ctx->i32, &vertex_count_addr); + add_arg_assign(&fninfo, ARG_SGPR, ac_array_in_const32_addr_space(ctx->v4i32), + &vb_desc); + add_arg_assign(&fninfo, ARG_SGPR, ac_array_in_const32_addr_space(const_desc_type), + &const_desc); + add_arg_assign(&fninfo, ARG_SGPR, ac_array_in_const32_addr_space(ctx->v8i32), + &sampler_desc); + add_arg_assign(&fninfo, ARG_SGPR, ctx->i32, &base_vertex); + add_arg_assign(&fninfo, ARG_SGPR, ctx->i32, &start_instance); + add_arg_assign(&fninfo, ARG_SGPR, ctx->i32, &num_prims_udiv_multiplier); + add_arg_assign(&fninfo, ARG_SGPR, ctx->i32, &num_prims_udiv_terms); + add_arg_assign(&fninfo, ARG_SGPR, ctx->i32, &restart_index); + add_arg_assign(&fninfo, ARG_SGPR, ctx->f32, &smallprim_precision); + + /* Block ID and thread ID inputs. */ + add_arg_assign(&fninfo, ARG_SGPR, ctx->i32, &block_id); + if (VERTEX_COUNTER_GDS_MODE == 2) + add_arg_assign(&fninfo, ARG_SGPR, ctx->i32, &ordered_wave_id); + add_arg_assign(&fninfo, ARG_VGPR, ctx->i32, &local_id); + + /* Create the compute shader function. */ + unsigned old_type = ctx->type; + ctx->type = PIPE_SHADER_COMPUTE; + si_create_function(ctx, "prim_discard_cs", NULL, 0, &fninfo, THREADGROUP_SIZE); + ctx->type = old_type; + + if (VERTEX_COUNTER_GDS_MODE == 1) { + ac_llvm_add_target_dep_function_attr(ctx->main_fn, "amdgpu-gds-size", + GDS_SIZE_UNORDERED); + } + + /* Assemble parameters for VS. 
*/ + LLVMValueRef vs_params[16]; + unsigned num_vs_params = 0; + unsigned param_vertex_id, param_instance_id; + + vs_params[num_vs_params++] = LLVMGetUndef(LLVMTypeOf(LLVMGetParam(vs, 0))); /* RW_BUFFERS */ + vs_params[num_vs_params++] = LLVMGetUndef(LLVMTypeOf(LLVMGetParam(vs, 1))); /* BINDLESS */ + vs_params[num_vs_params++] = const_desc; + vs_params[num_vs_params++] = sampler_desc; + vs_params[num_vs_params++] = LLVMConstInt(ctx->i32, + S_VS_STATE_INDEXED(key->opt.cs_indexed), 0); + vs_params[num_vs_params++] = base_vertex; + vs_params[num_vs_params++] = start_instance; + vs_params[num_vs_params++] = ctx->i32_0; /* DrawID */ + vs_params[num_vs_params++] = vb_desc; + + vs_params[(param_vertex_id = num_vs_params++)] = NULL; /* VertexID */ + vs_params[(param_instance_id = num_vs_params++)] = NULL; /* InstanceID */ + vs_params[num_vs_params++] = ctx->i32_0; /* unused (PrimID) */ + vs_params[num_vs_params++] = ctx->i32_0; /* unused */ + + assert(num_vs_params <= ARRAY_SIZE(vs_params)); + assert(num_vs_params == LLVMCountParamTypes(LLVMGetElementType(LLVMTypeOf(vs)))); + + /* Load descriptors. (load 8 dwords at once) */ + LLVMValueRef input_indexbuf, output_indexbuf, tmp, desc[8]; + + tmp = LLVMBuildPointerCast(builder, index_buffers_and_constants, + ac_array_in_const32_addr_space(ctx->v8i32), ""); + tmp = ac_build_load_to_sgpr(&ctx->ac, tmp, ctx->i32_0); + + for (unsigned i = 0; i < 8; i++) + desc[i] = ac_llvm_extract_elem(&ctx->ac, tmp, i); + + input_indexbuf = ac_build_gather_values(&ctx->ac, desc, 4); + output_indexbuf = ac_build_gather_values(&ctx->ac, desc + 4, 4); + + /* Compute PrimID and InstanceID. */ + LLVMValueRef global_thread_id = + ac_build_imad(&ctx->ac, block_id, + LLVMConstInt(ctx->i32, THREADGROUP_SIZE, 0), local_id); + LLVMValueRef prim_id = global_thread_id; /* PrimID within an instance */ + LLVMValueRef instance_id = ctx->i32_0; + + if (key->opt.cs_instancing) { + /* Unpack num_prims_udiv_terms. */ + LLVMValueRef post_shift = LLVMBuildAnd(builder, num_prims_udiv_terms, + LLVMConstInt(ctx->i32, 0x1f, 0), ""); + LLVMValueRef prims_per_instance = LLVMBuildLShr(builder, num_prims_udiv_terms, + LLVMConstInt(ctx->i32, 5, 0), ""); + /* Divide the total prim_id by the number of prims per instance. */ + instance_id = ac_build_fast_udiv_u31_d_not_one(&ctx->ac, prim_id, + num_prims_udiv_multiplier, + post_shift); + /* Compute the remainder. */ + prim_id = LLVMBuildSub(builder, prim_id, + LLVMBuildMul(builder, instance_id, + prims_per_instance, ""), ""); + } + + /* Generate indices (like a non-indexed draw call). */ + LLVMValueRef index[4] = {NULL, NULL, NULL, LLVMGetUndef(ctx->i32)}; + unsigned vertices_per_prim = 3; + + switch (key->opt.cs_prim_type) { + case PIPE_PRIM_TRIANGLES: + for (unsigned i = 0; i < 3; i++) { + index[i] = ac_build_imad(&ctx->ac, prim_id, + LLVMConstInt(ctx->i32, 3, 0), + LLVMConstInt(ctx->i32, i, 0)); + } + break; + case PIPE_PRIM_TRIANGLE_STRIP: + for (unsigned i = 0; i < 3; i++) { + index[i] = LLVMBuildAdd(builder, prim_id, + LLVMConstInt(ctx->i32, i, 0), ""); + } + break; + case PIPE_PRIM_TRIANGLE_FAN: + /* Vertex 1 is first and vertex 2 is last. This will go to the hw clipper + * and rasterizer as a normal triangle, so we need to put the provoking + * vertex into the correct index variable and preserve orientation at the same time. + * gl_VertexID is preserved, because it's equal to the index. 
+ */ + if (key->opt.cs_provoking_vertex_first) { + index[0] = LLVMBuildAdd(builder, prim_id, LLVMConstInt(ctx->i32, 1, 0), ""); + index[1] = LLVMBuildAdd(builder, prim_id, LLVMConstInt(ctx->i32, 2, 0), ""); + index[2] = ctx->i32_0; + } else { + index[0] = ctx->i32_0; + index[1] = LLVMBuildAdd(builder, prim_id, LLVMConstInt(ctx->i32, 1, 0), ""); + index[2] = LLVMBuildAdd(builder, prim_id, LLVMConstInt(ctx->i32, 2, 0), ""); + } + break; + default: + unreachable("unexpected primitive type"); + } + + /* Fetch indices. */ + if (key->opt.cs_indexed) { + for (unsigned i = 0; i < 3; i++) { + index[i] = ac_build_buffer_load_format(&ctx->ac, input_indexbuf, + index[i], ctx->i32_0, 1, + false, true); + index[i] = ac_to_integer(&ctx->ac, index[i]); + } + } + + /* Extract the ordered wave ID. */ + if (VERTEX_COUNTER_GDS_MODE == 2) { + ordered_wave_id = LLVMBuildLShr(builder, ordered_wave_id, + LLVMConstInt(ctx->i32, 6, 0), ""); + ordered_wave_id = LLVMBuildAnd(builder, ordered_wave_id, + LLVMConstInt(ctx->i32, 0xfff, 0), ""); + } + LLVMValueRef thread_id = + LLVMBuildAnd(builder, local_id, LLVMConstInt(ctx->i32, 63, 0), ""); + + /* Every other triangle in a strip has a reversed vertex order, so we + * need to swap vertices of odd primitives to get the correct primitive + * orientation when converting triangle strips to triangles. Primitive + * restart complicates it, because a strip can start anywhere. + */ + LLVMValueRef prim_restart_accepted = ctx->i1true; + + if (key->opt.cs_prim_type == PIPE_PRIM_TRIANGLE_STRIP) { + /* Without primitive restart, odd primitives have reversed orientation. + * Only primitive restart can flip it with respect to the first vertex + * of the draw call. + */ + LLVMValueRef first_is_odd = ctx->i1false; + + /* Handle primitive restart. */ + if (key->opt.cs_primitive_restart) { + /* Get the GDS primitive restart continue flag and clear + * the flag in vertex_counter. This flag is used when the draw + * call was split and we need to load the primitive orientation + * flag from GDS for the first wave too. + */ + LLVMValueRef gds_prim_restart_continue = + LLVMBuildLShr(builder, vertex_counter, + LLVMConstInt(ctx->i32, 31, 0), ""); + gds_prim_restart_continue = + LLVMBuildTrunc(builder, gds_prim_restart_continue, ctx->i1, ""); + vertex_counter = LLVMBuildAnd(builder, vertex_counter, + LLVMConstInt(ctx->i32, 0x7fffffff, 0), ""); + + LLVMValueRef index0_is_reset; + + for (unsigned i = 0; i < 3; i++) { + LLVMValueRef not_reset = LLVMBuildICmp(builder, LLVMIntNE, index[i], + restart_index, ""); + if (i == 0) + index0_is_reset = LLVMBuildNot(builder, not_reset, ""); + prim_restart_accepted = LLVMBuildAnd(builder, prim_restart_accepted, + not_reset, ""); + } + + /* If the previous waves flip the primitive orientation + * of the current triangle strip, it will be stored in GDS. + * + * Sometimes the correct orientation is not needed, in which case + * we don't need to execute this. + */ + if (key->opt.cs_need_correct_orientation && VERTEX_COUNTER_GDS_MODE == 2) { + /* If there are reset indices in this wave, get the thread index + * where the most recent strip starts relative to each thread. 
+ */ + LLVMValueRef preceding_threads_mask = + LLVMBuildSub(builder, + LLVMBuildShl(builder, ctx->ac.i64_1, + LLVMBuildZExt(builder, thread_id, ctx->i64, ""), ""), + ctx->ac.i64_1, ""); + + LLVMValueRef reset_threadmask = ac_get_i1_sgpr_mask(&ctx->ac, index0_is_reset); + LLVMValueRef preceding_reset_threadmask = + LLVMBuildAnd(builder, reset_threadmask, preceding_threads_mask, ""); + LLVMValueRef strip_start = + ac_build_umsb(&ctx->ac, preceding_reset_threadmask, NULL); + strip_start = LLVMBuildAdd(builder, strip_start, ctx->i32_1, ""); + + /* This flips the orientatino based on reset indices within this wave only. */ + first_is_odd = LLVMBuildTrunc(builder, strip_start, ctx->i1, ""); + + LLVMValueRef last_strip_start, prev_wave_state, ret, tmp; + LLVMValueRef is_first_wave, current_wave_resets_index; + + /* Get the thread index where the last strip starts in this wave. + * + * If the last strip doesn't start in this wave, the thread index + * will be 0. + * + * If the last strip starts in the next wave, the thread index will + * be 64. + */ + last_strip_start = ac_build_umsb(&ctx->ac, reset_threadmask, NULL); + last_strip_start = LLVMBuildAdd(builder, last_strip_start, ctx->i32_1, ""); + + struct si_thread0_section section; + si_enter_thread0_section(ctx, §ion, thread_id); + + /* This must be done in the thread 0 section, because + * we expect PrimID to be 0 for the whole first wave + * in this expression. + * + * NOTE: This will need to be different if we wanna support + * instancing with primitive restart. + */ + is_first_wave = LLVMBuildICmp(builder, LLVMIntEQ, prim_id, ctx->i32_0, ""); + is_first_wave = LLVMBuildAnd(builder, is_first_wave, + LLVMBuildNot(builder, + gds_prim_restart_continue, ""), ""); + current_wave_resets_index = LLVMBuildICmp(builder, LLVMIntNE, + last_strip_start, ctx->i32_0, ""); + + ret = ac_build_alloca_undef(&ctx->ac, ctx->i32, "prev_state"); + + /* Save the last strip start primitive index in GDS and read + * the value that previous waves stored. + * + * if (is_first_wave || current_wave_resets_strip) + * // Read the value that previous waves stored and store a new one. + * first_is_odd = ds.ordered.swap(last_strip_start); + * else + * // Just read the value that previous waves stored. + * first_is_odd = ds.ordered.add(0); + */ + struct lp_build_if_state if_overwrite_counter; + lp_build_if(&if_overwrite_counter, &ctx->gallivm, + LLVMBuildOr(builder, is_first_wave, + current_wave_resets_index, "")); + { + /* The GDS address is always 0 with ordered append. */ + tmp = si_build_ds_ordered_op(ctx, "swap", + ordered_wave_id, last_strip_start, + 1, true, false); + LLVMBuildStore(builder, tmp, ret); + } + lp_build_else(&if_overwrite_counter); + { + /* Just read the value from GDS. */ + tmp = si_build_ds_ordered_op(ctx, "add", + ordered_wave_id, ctx->i32_0, + 1, true, false); + LLVMBuildStore(builder, tmp, ret); + } + lp_build_endif(&if_overwrite_counter); + + prev_wave_state = LLVMBuildLoad(builder, ret, ""); + /* Ignore the return value if this is the first wave. */ + prev_wave_state = LLVMBuildSelect(builder, is_first_wave, + ctx->i32_0, prev_wave_state, ""); + si_exit_thread0_section(§ion, &prev_wave_state); + prev_wave_state = LLVMBuildTrunc(builder, prev_wave_state, ctx->i1, ""); + + /* If the strip start appears to be on thread 0 for the current primitive + * (meaning the reset index is not present in this wave and might have + * appeared in previous waves), use the value from GDS to determine + * primitive orientation. 
+ * + * If the strip start is in this wave for the current primitive, use + * the value from the current wave to determine primitive orientation. + */ + LLVMValueRef strip_start_is0 = LLVMBuildICmp(builder, LLVMIntEQ, + strip_start, ctx->i32_0, ""); + first_is_odd = LLVMBuildSelect(builder, strip_start_is0, prev_wave_state, + first_is_odd, ""); + } + } + /* prim_is_odd = (first_is_odd + current_is_odd) % 2. */ + LLVMValueRef prim_is_odd = + LLVMBuildXor(builder, first_is_odd, + LLVMBuildTrunc(builder, thread_id, ctx->i1, ""), ""); + + /* Determine the primitive orientation. + * Only swap the vertices that are not the provoking vertex. We need to keep + * the provoking vertex in place. + */ + if (key->opt.cs_provoking_vertex_first) { + LLVMValueRef index1 = index[1]; + LLVMValueRef index2 = index[2]; + index[1] = LLVMBuildSelect(builder, prim_is_odd, index2, index1, ""); + index[2] = LLVMBuildSelect(builder, prim_is_odd, index1, index2, ""); + } else { + LLVMValueRef index0 = index[0]; + LLVMValueRef index1 = index[1]; + index[0] = LLVMBuildSelect(builder, prim_is_odd, index1, index0, ""); + index[1] = LLVMBuildSelect(builder, prim_is_odd, index0, index1, ""); + } + } + + /* Execute the vertex shader for each vertex to get vertex positions. */ + LLVMValueRef pos[3][4]; + for (unsigned i = 0; i < vertices_per_prim; i++) { + vs_params[param_vertex_id] = index[i]; + vs_params[param_instance_id] = instance_id; + + LLVMValueRef ret = LLVMBuildCall(builder, vs, vs_params, num_vs_params, ""); + for (unsigned chan = 0; chan < 4; chan++) + pos[i][chan] = LLVMBuildExtractValue(builder, ret, chan, ""); + } + + /* Divide XYZ by W. */ + for (unsigned i = 0; i < vertices_per_prim; i++) { + for (unsigned chan = 0; chan < 3; chan++) + pos[i][chan] = ac_build_fdiv(&ctx->ac, pos[i][chan], pos[i][3]); + } + + /* Load the viewport state. */ + LLVMValueRef vp = ac_build_load_invariant(&ctx->ac, index_buffers_and_constants, + LLVMConstInt(ctx->i32, 2, 0)); + vp = LLVMBuildBitCast(builder, vp, ctx->v4f32, ""); + vp_scale[0] = ac_llvm_extract_elem(&ctx->ac, vp, 0); + vp_scale[1] = ac_llvm_extract_elem(&ctx->ac, vp, 1); + vp_translate[0] = ac_llvm_extract_elem(&ctx->ac, vp, 2); + vp_translate[1] = ac_llvm_extract_elem(&ctx->ac, vp, 3); + + /* Do culling. */ + struct ac_cull_options options = {}; + options.cull_front = key->opt.cs_cull_front; + options.cull_back = key->opt.cs_cull_back; + options.cull_view_xy = true; + options.cull_view_near_z = CULL_Z && key->opt.cs_cull_z; + options.cull_view_far_z = CULL_Z && key->opt.cs_cull_z; + options.cull_small_prims = true; + options.cull_zero_area = true; + options.cull_w = true; + options.use_halfz_clip_space = key->opt.cs_halfz_clip_space; + + LLVMValueRef accepted = + ac_cull_triangle(&ctx->ac, pos, prim_restart_accepted, + vp_scale, vp_translate, smallprim_precision, + &options); + + LLVMValueRef accepted_threadmask = ac_get_i1_sgpr_mask(&ctx->ac, accepted); + + /* Count the number of active threads by doing bitcount(accepted). */ + LLVMValueRef num_prims_accepted = + ac_build_intrinsic(&ctx->ac, "llvm.ctpop.i64", ctx->i64, + &accepted_threadmask, 1, AC_FUNC_ATTR_READNONE); + num_prims_accepted = LLVMBuildTrunc(builder, num_prims_accepted, ctx->i32, ""); + + LLVMValueRef start; + + /* Execute atomic_add on the vertex count. 
*/ + struct si_thread0_section section; + si_enter_thread0_section(ctx, §ion, thread_id); + { + if (VERTEX_COUNTER_GDS_MODE == 0) { + LLVMValueRef num_indices = LLVMBuildMul(builder, num_prims_accepted, + LLVMConstInt(ctx->i32, vertices_per_prim, 0), ""); + vertex_counter = si_expand_32bit_pointer(ctx, vertex_counter); + start = LLVMBuildAtomicRMW(builder, LLVMAtomicRMWBinOpAdd, + vertex_counter, num_indices, + LLVMAtomicOrderingMonotonic, false); + } else if (VERTEX_COUNTER_GDS_MODE == 1) { + LLVMValueRef num_indices = LLVMBuildMul(builder, num_prims_accepted, + LLVMConstInt(ctx->i32, vertices_per_prim, 0), ""); + vertex_counter = LLVMBuildIntToPtr(builder, vertex_counter, + LLVMPointerType(ctx->i32, AC_ADDR_SPACE_GDS), ""); + start = LLVMBuildAtomicRMW(builder, LLVMAtomicRMWBinOpAdd, + vertex_counter, num_indices, + LLVMAtomicOrderingMonotonic, false); + } else if (VERTEX_COUNTER_GDS_MODE == 2) { + LLVMValueRef tmp_store = ac_build_alloca_undef(&ctx->ac, ctx->i32, ""); + + /* If the draw call was split into multiple subdraws, each using + * a separate draw packet, we need to start counting from 0 for + * the first compute wave of the subdraw. + * + * vertex_counter contains the primitive ID of the first thread + * in the first wave. + * + * This is only correct with VERTEX_COUNTER_GDS_MODE == 2: + */ + LLVMValueRef is_first_wave = + LLVMBuildICmp(builder, LLVMIntEQ, global_thread_id, + vertex_counter, ""); + + /* Store the primitive count for ordered append, not vertex count. + * The idea is to avoid GDS initialization via CP DMA. The shader + * effectively stores the first count using "swap". + * + * if (first_wave) { + * ds.ordered.swap(num_prims_accepted); // store the first primitive count + * previous = 0; + * } else { + * previous = ds.ordered.add(num_prims_accepted) // add the primitive count + * } + */ + struct lp_build_if_state if_first_wave; + lp_build_if(&if_first_wave, &ctx->gallivm, is_first_wave); + { + /* The GDS address is always 0 with ordered append. */ + si_build_ds_ordered_op(ctx, "swap", ordered_wave_id, + num_prims_accepted, 0, true, true); + LLVMBuildStore(builder, ctx->i32_0, tmp_store); + } + lp_build_else(&if_first_wave); + { + LLVMBuildStore(builder, + si_build_ds_ordered_op(ctx, "add", ordered_wave_id, + num_prims_accepted, 0, + true, true), + tmp_store); + } + lp_build_endif(&if_first_wave); + + start = LLVMBuildLoad(builder, tmp_store, ""); + } + } + si_exit_thread0_section(§ion, &start); + + /* Write the final vertex count to memory. An EOS/EOP event could do this, + * but those events are super slow and should be avoided if performance + * is a concern. Thanks to GDS ordered append, we can emulate a CS_DONE + * event like this. + */ + if (VERTEX_COUNTER_GDS_MODE == 2) { + struct lp_build_if_state if_last_wave; + lp_build_if(&if_last_wave, &ctx->gallivm, + LLVMBuildICmp(builder, LLVMIntEQ, global_thread_id, + last_wave_prim_id, "")); + LLVMValueRef count = LLVMBuildAdd(builder, start, num_prims_accepted, ""); + count = LLVMBuildMul(builder, count, + LLVMConstInt(ctx->i32, vertices_per_prim, 0), ""); + + /* VI needs to disable caching, so that the CP can see the stored value. + * MTYPE=3 bypasses TC L2. 
+ */ + if (ctx->screen->info.chip_class <= GFX8) { + LLVMValueRef desc[] = { + vertex_count_addr, + LLVMConstInt(ctx->i32, + S_008F04_BASE_ADDRESS_HI(ctx->screen->info.address32_hi), 0), + LLVMConstInt(ctx->i32, 4, 0), + LLVMConstInt(ctx->i32, S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32) | + S_008F0C_MTYPE(3 /* uncached */), 0), + }; + LLVMValueRef rsrc = ac_build_gather_values(&ctx->ac, desc, 4); + ac_build_buffer_store_dword(&ctx->ac, rsrc, count, 1, ctx->i32_0, + ctx->i32_0, 0, true, true, true, false); + } else { + LLVMBuildStore(builder, count, + si_expand_32bit_pointer(ctx, vertex_count_addr)); + } + lp_build_endif(&if_last_wave); + } else { + /* For unordered modes that increment a vertex count instead of + * primitive count, convert it into the primitive index. + */ + start = LLVMBuildUDiv(builder, start, + LLVMConstInt(ctx->i32, vertices_per_prim, 0), ""); + } + + /* Now we need to store the indices of accepted primitives into + * the output index buffer. + */ + struct lp_build_if_state if_accepted; + lp_build_if(&if_accepted, &ctx->gallivm, accepted); + { + /* Get the number of bits set before the index of this thread. */ + LLVMValueRef prim_index = ac_build_mbcnt(&ctx->ac, accepted_threadmask); + + /* We have lowered instancing. Pack the instance ID into vertex ID. */ + if (key->opt.cs_instancing) { + instance_id = LLVMBuildShl(builder, instance_id, + LLVMConstInt(ctx->i32, 16, 0), ""); + + for (unsigned i = 0; i < vertices_per_prim; i++) + index[i] = LLVMBuildOr(builder, index[i], instance_id, ""); + } + + if (VERTEX_COUNTER_GDS_MODE == 2) { + /* vertex_counter contains the first primitive ID + * for this dispatch. If the draw call was split into + * multiple subdraws, the first primitive ID is > 0 + * for subsequent subdraws. Each subdraw uses a different + * portion of the output index buffer. Offset the store + * vindex by the first primitive ID to get the correct + * store address for the subdraw. + */ + start = LLVMBuildAdd(builder, start, vertex_counter, ""); + } + + /* Write indices for accepted primitives. */ + LLVMValueRef buf_args[] = { + ac_to_float(&ctx->ac, ac_build_expand_to_vec4(&ctx->ac, + ac_build_gather_values(&ctx->ac, index, 3), 3)), + output_indexbuf, + LLVMBuildAdd(builder, start, prim_index, ""), + ctx->i32_0, /* voffset */ + ctx->i1true, /* glc */ + LLVMConstInt(ctx->i1, INDEX_STORES_USE_SLC, 0), + }; + ac_build_intrinsic(&ctx->ac, "llvm.amdgcn.buffer.store.format.v4f32", + ctx->voidt, buf_args, 6, + ac_get_store_intr_attribs(true)); + } + lp_build_endif(&if_accepted); + + LLVMBuildRetVoid(builder); +} + +/* Return false if the shader isn't ready. */ +static bool si_shader_select_prim_discard_cs(struct si_context *sctx, + const struct pipe_draw_info *info) +{ + struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; + struct si_shader_key key; + + /* Primitive restart needs ordered counters. 
*/ + assert(!info->primitive_restart || VERTEX_COUNTER_GDS_MODE == 2); + assert(!info->primitive_restart || info->instance_count == 1); + + memset(&key, 0, sizeof(key)); + si_shader_selector_key_vs(sctx, sctx->vs_shader.cso, &key, &key.part.vs.prolog); + assert(!key.part.vs.prolog.instance_divisor_is_fetched); + + key.part.vs.prolog.unpack_instance_id_from_vertex_id = 0; + key.opt.vs_as_prim_discard_cs = 1; + key.opt.cs_prim_type = info->mode; + key.opt.cs_indexed = info->index_size != 0; + key.opt.cs_instancing = info->instance_count > 1; + key.opt.cs_primitive_restart = info->primitive_restart; + key.opt.cs_provoking_vertex_first = rs->provoking_vertex_first; + + /* Primitive restart with triangle strips needs to preserve primitive + * orientation for cases where front and back primitive orientation matters. + */ + if (info->primitive_restart) { + struct si_shader_selector *ps = sctx->ps_shader.cso; + + key.opt.cs_need_correct_orientation = + rs->cull_front != rs->cull_back || + ps->info.uses_frontface || + (rs->two_side && ps->info.colors_read); + } + + if (rs->rasterizer_discard) { + /* Just for performance testing and analysis of trivial bottlenecks. + * This should result in a very short compute shader. */ + key.opt.cs_cull_front = 1; + key.opt.cs_cull_back = 1; + } else { + key.opt.cs_cull_front = + sctx->viewports.y_inverted ? rs->cull_back : rs->cull_front; + key.opt.cs_cull_back = + sctx->viewports.y_inverted ? rs->cull_front : rs->cull_back; + } + + if (!rs->depth_clamp_any && CULL_Z) { + key.opt.cs_cull_z = 1; + key.opt.cs_halfz_clip_space = rs->clip_halfz; + } + + sctx->cs_prim_discard_state.cso = sctx->vs_shader.cso; + sctx->cs_prim_discard_state.current = NULL; + + struct si_compiler_ctx_state compiler_state; + compiler_state.compiler = &sctx->compiler; + compiler_state.debug = sctx->debug; + compiler_state.is_debug_context = sctx->is_debug; + + return si_shader_select_with_key(sctx->screen, &sctx->cs_prim_discard_state, + &compiler_state, &key, -1, true) == 0 && + /* Disallow compute shaders using the scratch buffer. */ + sctx->cs_prim_discard_state.current->config.scratch_bytes_per_wave == 0; +} + +static bool si_initialize_prim_discard_cmdbuf(struct si_context *sctx) +{ + if (sctx->index_ring) + return true; + + if (!sctx->prim_discard_compute_cs) { + struct radeon_winsys *ws = sctx->ws; + unsigned gds_size = VERTEX_COUNTER_GDS_MODE == 1 ? GDS_SIZE_UNORDERED : + VERTEX_COUNTER_GDS_MODE == 2 ? 8 : 0; + unsigned num_oa_counters = VERTEX_COUNTER_GDS_MODE == 2 ? 
2 : 0; + + if (gds_size) { + sctx->gds = ws->buffer_create(ws, gds_size, 4, + RADEON_DOMAIN_GDS, 0); + if (!sctx->gds) + return false; + + ws->cs_add_buffer(sctx->gfx_cs, sctx->gds, + RADEON_USAGE_READWRITE, 0, 0); + } + if (num_oa_counters) { + assert(gds_size); + sctx->gds_oa = ws->buffer_create(ws, num_oa_counters, + 1, RADEON_DOMAIN_OA, 0); + if (!sctx->gds_oa) + return false; + + ws->cs_add_buffer(sctx->gfx_cs, sctx->gds_oa, + RADEON_USAGE_READWRITE, 0, 0); + } + + sctx->prim_discard_compute_cs = + ws->cs_add_parallel_compute_ib(sctx->gfx_cs, + num_oa_counters > 0); + if (!sctx->prim_discard_compute_cs) + return false; + } + + if (!sctx->index_ring) { + sctx->index_ring = + si_aligned_buffer_create(sctx->b.screen, + SI_RESOURCE_FLAG_UNMAPPABLE, + PIPE_USAGE_DEFAULT, + sctx->index_ring_size_per_ib * 2, + 2 * 1024 * 1024); + if (!sctx->index_ring) + return false; + } + return true; +} + +static bool si_check_ring_space(struct si_context *sctx, unsigned out_indexbuf_size) +{ + return sctx->index_ring_offset + + align(out_indexbuf_size, sctx->screen->info.tcc_cache_line_size) <= + sctx->index_ring_size_per_ib; +} + +enum si_prim_discard_outcome +si_prepare_prim_discard_or_split_draw(struct si_context *sctx, + const struct pipe_draw_info *info) +{ + /* If the compute shader compilation isn't finished, this returns false. */ + if (!si_shader_select_prim_discard_cs(sctx, info)) + return SI_PRIM_DISCARD_DISABLED; + + if (!si_initialize_prim_discard_cmdbuf(sctx)) + return SI_PRIM_DISCARD_DISABLED; + + struct radeon_cmdbuf *gfx_cs = sctx->gfx_cs; + unsigned prim = info->mode; + unsigned count = info->count; + unsigned instance_count = info->instance_count; + unsigned num_prims_per_instance = u_decomposed_prims_for_vertices(prim, count); + unsigned num_prims = num_prims_per_instance * instance_count; + unsigned out_indexbuf_size = num_prims * 12; + bool ring_full = !si_check_ring_space(sctx, out_indexbuf_size); + const unsigned split_prims_draw_level = SPLIT_PRIMS_DRAW_LEVEL; + + /* Split draws at the draw call level if the ring is full. This makes + * better use of the ring space. + */ + if (ring_full && + num_prims > split_prims_draw_level && + instance_count == 1 && /* TODO: support splitting instanced draws */ + (1 << prim) & ((1 << PIPE_PRIM_TRIANGLES) | + (1 << PIPE_PRIM_TRIANGLE_STRIP))) { + /* Split draws. */ + struct pipe_draw_info split_draw = *info; + unsigned base_start = split_draw.start; + + if (prim == PIPE_PRIM_TRIANGLES) { + unsigned vert_count_per_subdraw = split_prims_draw_level * 3; + assert(vert_count_per_subdraw < count); + + for (unsigned start = 0; start < count; start += vert_count_per_subdraw) { + split_draw.start = base_start + start; + split_draw.count = MIN2(count - start, vert_count_per_subdraw); + + sctx->b.draw_vbo(&sctx->b, &split_draw); + } + } else if (prim == PIPE_PRIM_TRIANGLE_STRIP) { + /* No primitive pair can be split, because strips reverse orientation + * for odd primitives. 
*/ + STATIC_ASSERT(split_prims_draw_level % 2 == 0); + + unsigned vert_count_per_subdraw = split_prims_draw_level; + + for (unsigned start = 0; start < count - 2; start += vert_count_per_subdraw) { + split_draw.start = base_start + start; + split_draw.count = MIN2(count - start, vert_count_per_subdraw + 2); + + sctx->b.draw_vbo(&sctx->b, &split_draw); + + if (start == 0 && + split_draw.primitive_restart && + sctx->cs_prim_discard_state.current->key.opt.cs_need_correct_orientation) + sctx->preserve_prim_restart_gds_at_flush = true; + } + sctx->preserve_prim_restart_gds_at_flush = false; + } else { + assert(0); + } + + return SI_PRIM_DISCARD_DRAW_SPLIT; + } + + /* Just quit if the draw call doesn't fit into the ring and can't be split. */ + if (out_indexbuf_size > sctx->index_ring_size_per_ib) { + if (SI_PRIM_DISCARD_DEBUG) + puts("PD failed: draw call too big, can't be split"); + return SI_PRIM_DISCARD_DISABLED; + } + + unsigned num_subdraws = DIV_ROUND_UP(num_prims, SPLIT_PRIMS_PACKET_LEVEL); + unsigned need_compute_dw = 11 /* shader */ + 34 /* first draw */ + + 24 * (num_subdraws - 1) + /* subdraws */ + 20; /* leave some space at the end */ + unsigned need_gfx_dw = si_get_minimum_num_gfx_cs_dwords(sctx); + + if (sctx->chip_class <= GFX7 || FORCE_REWIND_EMULATION) + need_gfx_dw += 9; /* NOP(2) + WAIT_REG_MEM(7), then chain */ + else + need_gfx_dw += num_subdraws * 8; /* use REWIND(2) + DRAW(6) */ + + if (ring_full || + (VERTEX_COUNTER_GDS_MODE == 1 && sctx->compute_gds_offset + 8 > GDS_SIZE_UNORDERED) || + !sctx->ws->cs_check_space(gfx_cs, need_gfx_dw, false)) { + /* If the current IB is empty but the size is too small, add a NOP + * packet to force a flush and get a bigger IB. + */ + if (!radeon_emitted(gfx_cs, sctx->initial_gfx_cs_size) && + gfx_cs->current.cdw + need_gfx_dw > gfx_cs->current.max_dw) { + radeon_emit(gfx_cs, PKT3(PKT3_NOP, 0, 0)); + radeon_emit(gfx_cs, 0); + } + + si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL); + } + + /* The compute IB is always chained, but we need to call cs_check_space to add more space. */ + struct radeon_cmdbuf *cs = sctx->prim_discard_compute_cs; + bool compute_has_space = sctx->ws->cs_check_space(cs, need_compute_dw, false); + assert(compute_has_space); + assert(si_check_ring_space(sctx, out_indexbuf_size)); + return SI_PRIM_DISCARD_ENABLED; +} + +void si_compute_signal_gfx(struct si_context *sctx) +{ + struct radeon_cmdbuf *cs = sctx->prim_discard_compute_cs; + unsigned writeback_L2_flags = 0; + + /* The writeback L2 flags vary with each chip generation. */ + /* CI needs to flush vertex indices to memory. */ + if (sctx->chip_class <= GFX7) + writeback_L2_flags = EVENT_TC_WB_ACTION_ENA; + else if (sctx->chip_class == GFX8 && VERTEX_COUNTER_GDS_MODE == 0) + writeback_L2_flags = EVENT_TC_WB_ACTION_ENA | EVENT_TC_NC_ACTION_ENA; + + if (!sctx->compute_num_prims_in_batch) + return; + + assert(sctx->compute_rewind_va); + + /* After the queued dispatches are done and vertex counts are written to + * the gfx IB, signal the gfx IB to continue. CP doesn't wait for + * the dispatches to finish, it only adds the CS_DONE event into the event + * queue. + */ + si_cp_release_mem(sctx, cs, V_028A90_CS_DONE, writeback_L2_flags, + sctx->chip_class <= GFX8 ? EOP_DST_SEL_MEM : EOP_DST_SEL_TC_L2, + writeback_L2_flags ? 
EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM : + EOP_INT_SEL_NONE, + EOP_DATA_SEL_VALUE_32BIT, + NULL, + sctx->compute_rewind_va | + ((uint64_t)sctx->screen->info.address32_hi << 32), + REWIND_SIGNAL_BIT, /* signaling value for the REWIND packet */ + SI_NOT_QUERY); + + sctx->compute_rewind_va = 0; + sctx->compute_num_prims_in_batch = 0; +} + +/* Dispatch a primitive discard compute shader. */ +void si_dispatch_prim_discard_cs_and_draw(struct si_context *sctx, + const struct pipe_draw_info *info, + unsigned index_size, + unsigned base_vertex, + uint64_t input_indexbuf_va, + unsigned input_indexbuf_num_elements) +{ + struct radeon_cmdbuf *gfx_cs = sctx->gfx_cs; + struct radeon_cmdbuf *cs = sctx->prim_discard_compute_cs; + unsigned num_prims_per_instance = u_decomposed_prims_for_vertices(info->mode, info->count); + if (!num_prims_per_instance) + return; + + unsigned num_prims = num_prims_per_instance * info->instance_count; + unsigned vertices_per_prim, output_indexbuf_format; + + switch (info->mode) { + case PIPE_PRIM_TRIANGLES: + case PIPE_PRIM_TRIANGLE_STRIP: + case PIPE_PRIM_TRIANGLE_FAN: + vertices_per_prim = 3; + output_indexbuf_format = V_008F0C_BUF_DATA_FORMAT_32_32_32; + break; + default: + unreachable("unsupported primitive type"); + return; + } + + unsigned out_indexbuf_offset; + uint64_t output_indexbuf_size = num_prims * vertices_per_prim * 4; + bool first_dispatch = !sctx->prim_discard_compute_ib_initialized; + + /* Initialize the compute IB if it's empty. */ + if (!sctx->prim_discard_compute_ib_initialized) { + /* 1) State initialization. */ + sctx->compute_gds_offset = 0; + sctx->compute_ib_last_shader = NULL; + + if (sctx->last_ib_barrier_fence) { + assert(!sctx->last_ib_barrier_buf); + sctx->ws->cs_add_fence_dependency(gfx_cs, + sctx->last_ib_barrier_fence, + RADEON_DEPENDENCY_PARALLEL_COMPUTE_ONLY); + } + + /* 2) IB initialization. */ + /* Restore the GDS prim restart counter if needed. */ + if (sctx->preserve_prim_restart_gds_at_flush) { + si_cp_copy_data(sctx, cs, + COPY_DATA_GDS, NULL, 4, + COPY_DATA_SRC_MEM, sctx->wait_mem_scratch, 4); + } + + si_emit_initial_compute_regs(sctx, cs); + + radeon_set_sh_reg(cs, R_00B860_COMPUTE_TMPRING_SIZE, + S_00B860_WAVES(sctx->scratch_waves) | + S_00B860_WAVESIZE(0)); /* no scratch */ + + /* Only 1D grids are launched. */ + radeon_set_sh_reg_seq(cs, R_00B820_COMPUTE_NUM_THREAD_Y, 2); + radeon_emit(cs, S_00B820_NUM_THREAD_FULL(1) | + S_00B820_NUM_THREAD_PARTIAL(1)); + radeon_emit(cs, S_00B824_NUM_THREAD_FULL(1) | + S_00B824_NUM_THREAD_PARTIAL(1)); + + radeon_set_sh_reg_seq(cs, R_00B814_COMPUTE_START_Y, 2); + radeon_emit(cs, 0); + radeon_emit(cs, 0); + + /* Disable ordered alloc for OA resources. */ + for (unsigned i = 0; i < 2; i++) { + radeon_set_uconfig_reg_seq(cs, R_031074_GDS_OA_CNTL, 3); + radeon_emit(cs, S_031074_INDEX(i)); + radeon_emit(cs, 0); + radeon_emit(cs, S_03107C_ENABLE(0)); + } + + if (sctx->last_ib_barrier_buf) { + assert(!sctx->last_ib_barrier_fence); + radeon_add_to_buffer_list(sctx, gfx_cs, sctx->last_ib_barrier_buf, + RADEON_USAGE_READ, RADEON_PRIO_FENCE); + si_cp_wait_mem(sctx, cs, + sctx->last_ib_barrier_buf->gpu_address + + sctx->last_ib_barrier_buf_offset, 1, 1, + WAIT_REG_MEM_EQUAL); + } + + sctx->prim_discard_compute_ib_initialized = true; + } + + /* Allocate the output index buffer. 
*/ + output_indexbuf_size = align(output_indexbuf_size, + sctx->screen->info.tcc_cache_line_size); + assert(sctx->index_ring_offset + output_indexbuf_size <= sctx->index_ring_size_per_ib); + out_indexbuf_offset = sctx->index_ring_base + sctx->index_ring_offset; + sctx->index_ring_offset += output_indexbuf_size; + + radeon_add_to_buffer_list(sctx, gfx_cs, sctx->index_ring, RADEON_USAGE_READWRITE, + RADEON_PRIO_SHADER_RW_BUFFER); + uint64_t out_indexbuf_va = sctx->index_ring->gpu_address + out_indexbuf_offset; + + /* Prepare index buffer descriptors. */ + struct si_resource *indexbuf_desc = NULL; + unsigned indexbuf_desc_offset; + unsigned desc_size = 12 * 4; + uint32_t *desc; + + u_upload_alloc(sctx->b.const_uploader, 0, desc_size, + si_optimal_tcc_alignment(sctx, desc_size), + &indexbuf_desc_offset, (struct pipe_resource**)&indexbuf_desc, + (void**)&desc); + radeon_add_to_buffer_list(sctx, gfx_cs, indexbuf_desc, RADEON_USAGE_READ, + RADEON_PRIO_DESCRIPTORS); + + /* Input index buffer. */ + desc[0] = input_indexbuf_va; + desc[1] = S_008F04_BASE_ADDRESS_HI(input_indexbuf_va >> 32) | + S_008F04_STRIDE(index_size); + desc[2] = input_indexbuf_num_elements * (sctx->chip_class == GFX8 ? index_size : 1); + desc[3] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | + S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_UINT) | + S_008F0C_DATA_FORMAT(index_size == 1 ? V_008F0C_BUF_DATA_FORMAT_8 : + index_size == 2 ? V_008F0C_BUF_DATA_FORMAT_16 : + V_008F0C_BUF_DATA_FORMAT_32); + + /* Output index buffer. */ + desc[4] = out_indexbuf_va; + desc[5] = S_008F04_BASE_ADDRESS_HI(out_indexbuf_va >> 32) | + S_008F04_STRIDE(vertices_per_prim * 4); + desc[6] = num_prims * (sctx->chip_class == GFX8 ? vertices_per_prim * 4 : 1); + desc[7] = S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | + S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | + S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | + S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_0) | + S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_UINT) | + S_008F0C_DATA_FORMAT(output_indexbuf_format); + + /* Viewport state. + * This is needed by the small primitive culling, because it's done + * in screen space. + */ + float scale[2], translate[2]; + + scale[0] = sctx->viewports.states[0].scale[0]; + scale[1] = sctx->viewports.states[0].scale[1]; + translate[0] = sctx->viewports.states[0].translate[0]; + translate[1] = sctx->viewports.states[0].translate[1]; + + /* The viewport shouldn't flip the X axis for the small prim culling to work. */ + assert(-scale[0] + translate[0] <= scale[0] + translate[0]); + + /* If the Y axis is inverted (OpenGL default framebuffer), reverse it. + * This is because the viewport transformation inverts the clip space + * bounding box, so min becomes max, which breaks small primitive + * culling. + */ + if (sctx->viewports.y_inverted) { + scale[1] = -scale[1]; + translate[1] = -translate[1]; + } + + /* Scale the framebuffer up, so that samples become pixels and small + * primitive culling is the same for all sample counts. + * This only works with the standard DX sample positions, because + * the samples are evenly spaced on both X and Y axes. + */ + unsigned num_samples = sctx->framebuffer.nr_samples; + assert(num_samples >= 1); + + for (unsigned i = 0; i < 2; i++) { + scale[i] *= num_samples; + translate[i] *= num_samples; + } + + desc[8] = fui(scale[0]); + desc[9] = fui(scale[1]); + desc[10] = fui(translate[0]); + desc[11] = fui(translate[1]); + + /* Better subpixel precision increases the efficiency of small + * primitive culling. 
*/ + unsigned quant_mode = sctx->viewports.as_scissor[0].quant_mode; + float small_prim_cull_precision; + + if (quant_mode == SI_QUANT_MODE_12_12_FIXED_POINT_1_4096TH) + small_prim_cull_precision = num_samples / 4096.0; + else if (quant_mode == SI_QUANT_MODE_14_10_FIXED_POINT_1_1024TH) + small_prim_cull_precision = num_samples / 1024.0; + else + small_prim_cull_precision = num_samples / 256.0; + + /* Set user data SGPRs. */ + /* This can't be greater than 14 if we want the fastest launch rate. */ + unsigned user_sgprs = 13; + + uint64_t index_buffers_va = indexbuf_desc->gpu_address + indexbuf_desc_offset; + unsigned vs_const_desc = si_const_and_shader_buffer_descriptors_idx(PIPE_SHADER_VERTEX); + unsigned vs_sampler_desc = si_sampler_and_image_descriptors_idx(PIPE_SHADER_VERTEX); + uint64_t vs_const_desc_va = sctx->descriptors[vs_const_desc].gpu_address; + uint64_t vs_sampler_desc_va = sctx->descriptors[vs_sampler_desc].gpu_address; + uint64_t vb_desc_va = sctx->vb_descriptors_buffer ? + sctx->vb_descriptors_buffer->gpu_address + + sctx->vb_descriptors_offset : 0; + unsigned gds_offset, gds_size; + struct si_fast_udiv_info32 num_prims_udiv = {}; + + if (info->instance_count > 1) + num_prims_udiv = si_compute_fast_udiv_info32(num_prims_per_instance, 31); + + /* Limitations on how these two are packed in the user SGPR. */ + assert(num_prims_udiv.post_shift < 32); + assert(num_prims_per_instance < 1 << 27); + + si_resource_reference(&indexbuf_desc, NULL); + + if (VERTEX_COUNTER_GDS_MODE == 1) { + gds_offset = sctx->compute_gds_offset; + gds_size = info->primitive_restart ? 8 : 4; + sctx->compute_gds_offset += gds_size; + + /* Reset the counters in GDS for the first dispatch using WRITE_DATA. + * The remainder of the GDS will be cleared after the dispatch packet + * in parallel with compute shaders. + */ + if (first_dispatch) { + radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 2 + gds_size/4, 0)); + radeon_emit(cs, S_370_DST_SEL(V_370_GDS) | S_370_WR_CONFIRM(1)); + radeon_emit(cs, gds_offset); + radeon_emit(cs, 0); + radeon_emit(cs, 0); /* value to write */ + if (gds_size == 8) + radeon_emit(cs, 0); + } + } + + /* Set shader registers. 
*/ + struct si_shader *shader = sctx->cs_prim_discard_state.current; + + if (shader != sctx->compute_ib_last_shader) { + radeon_add_to_buffer_list(sctx, gfx_cs, shader->bo, RADEON_USAGE_READ, + RADEON_PRIO_SHADER_BINARY); + uint64_t shader_va = shader->bo->gpu_address; + + assert(shader->config.scratch_bytes_per_wave == 0); + assert(shader->config.num_vgprs * WAVES_PER_TG <= 256 * 4); + + radeon_set_sh_reg_seq(cs, R_00B830_COMPUTE_PGM_LO, 2); + radeon_emit(cs, shader_va >> 8); + radeon_emit(cs, S_00B834_DATA(shader_va >> 40)); + + radeon_set_sh_reg_seq(cs, R_00B848_COMPUTE_PGM_RSRC1, 2); + radeon_emit(cs, S_00B848_VGPRS((shader->config.num_vgprs - 1) / 4) | + S_00B848_SGPRS((shader->config.num_sgprs - 1) / 8) | + S_00B848_FLOAT_MODE(shader->config.float_mode) | + S_00B848_DX10_CLAMP(1)); + radeon_emit(cs, S_00B84C_SCRATCH_EN(0 /* no scratch */) | + S_00B84C_USER_SGPR(user_sgprs) | + S_00B84C_TGID_X_EN(1 /* only blockID.x is used */) | + S_00B84C_TG_SIZE_EN(VERTEX_COUNTER_GDS_MODE == 2 /* need the wave ID */) | + S_00B84C_TIDIG_COMP_CNT(0 /* only threadID.x is used */) | + S_00B84C_LDS_SIZE(shader->config.lds_size)); + + radeon_set_sh_reg(cs, R_00B854_COMPUTE_RESOURCE_LIMITS, + si_get_compute_resource_limits(sctx->screen, WAVES_PER_TG, + MAX_WAVES_PER_SH, THREADGROUPS_PER_CU)); + sctx->compute_ib_last_shader = shader; + } + + STATIC_ASSERT(SPLIT_PRIMS_PACKET_LEVEL % THREADGROUP_SIZE == 0); + + /* Big draw calls are split into smaller dispatches and draw packets. */ + for (unsigned start_prim = 0; start_prim < num_prims; start_prim += SPLIT_PRIMS_PACKET_LEVEL) { + unsigned num_subdraw_prims; + + if (start_prim + SPLIT_PRIMS_PACKET_LEVEL < num_prims) + num_subdraw_prims = SPLIT_PRIMS_PACKET_LEVEL; + else + num_subdraw_prims = num_prims - start_prim; + + /* Small dispatches are executed back to back until a specific primitive + * count is reached. Then, a CS_DONE is inserted to signal the gfx IB + * to start drawing the batch. This batching adds latency to the gfx IB, + * but CS_DONE and REWIND are too slow. + */ + if (sctx->compute_num_prims_in_batch + num_subdraw_prims > PRIMS_PER_BATCH) + si_compute_signal_gfx(sctx); + + if (sctx->compute_num_prims_in_batch == 0) { + assert((gfx_cs->gpu_address >> 32) == sctx->screen->info.address32_hi); + sctx->compute_rewind_va = gfx_cs->gpu_address + (gfx_cs->current.cdw + 1) * 4; + + if (sctx->chip_class <= GFX7 || FORCE_REWIND_EMULATION) { + radeon_emit(gfx_cs, PKT3(PKT3_NOP, 0, 0)); + radeon_emit(gfx_cs, 0); + + si_cp_wait_mem(sctx, gfx_cs, + sctx->compute_rewind_va | + (uint64_t)sctx->screen->info.address32_hi << 32, + REWIND_SIGNAL_BIT, REWIND_SIGNAL_BIT, + WAIT_REG_MEM_EQUAL | WAIT_REG_MEM_PFP); + + /* Use INDIRECT_BUFFER to chain to a different buffer + * to discard the CP prefetch cache. + */ + sctx->ws->cs_check_space(gfx_cs, 0, true); + } else { + radeon_emit(gfx_cs, PKT3(PKT3_REWIND, 0, 0)); + radeon_emit(gfx_cs, 0); + } + } + + sctx->compute_num_prims_in_batch += num_subdraw_prims; + + uint32_t count_va = gfx_cs->gpu_address + (gfx_cs->current.cdw + 4) * 4; + uint64_t index_va = out_indexbuf_va + start_prim * 12; + + /* Emit the draw packet into the gfx IB. */ + radeon_emit(gfx_cs, PKT3(PKT3_DRAW_INDEX_2, 4, 0)); + radeon_emit(gfx_cs, num_prims * vertices_per_prim); + radeon_emit(gfx_cs, index_va); + radeon_emit(gfx_cs, index_va >> 32); + radeon_emit(gfx_cs, 0); + radeon_emit(gfx_cs, V_0287F0_DI_SRC_SEL_DMA); + + /* Continue with the compute IB. 
*/ + if (start_prim == 0) { + uint32_t gds_prim_restart_continue_bit = 0; + + if (sctx->preserve_prim_restart_gds_at_flush) { + assert(info->primitive_restart && + info->mode == PIPE_PRIM_TRIANGLE_STRIP); + assert(start_prim < 1 << 31); + gds_prim_restart_continue_bit = 1 << 31; + } + + radeon_set_sh_reg_seq(cs, R_00B900_COMPUTE_USER_DATA_0, user_sgprs); + radeon_emit(cs, index_buffers_va); + radeon_emit(cs, + VERTEX_COUNTER_GDS_MODE == 0 ? count_va : + VERTEX_COUNTER_GDS_MODE == 1 ? gds_offset : + start_prim | + gds_prim_restart_continue_bit); + radeon_emit(cs, start_prim + num_subdraw_prims - 1); + radeon_emit(cs, count_va); + radeon_emit(cs, vb_desc_va); + radeon_emit(cs, vs_const_desc_va); + radeon_emit(cs, vs_sampler_desc_va); + radeon_emit(cs, base_vertex); + radeon_emit(cs, info->start_instance); + radeon_emit(cs, num_prims_udiv.multiplier); + radeon_emit(cs, num_prims_udiv.post_shift | + (num_prims_per_instance << 5)); + radeon_emit(cs, info->restart_index); + /* small-prim culling precision (same as rasterizer precision = QUANT_MODE) */ + radeon_emit(cs, fui(small_prim_cull_precision)); + } else { + assert(VERTEX_COUNTER_GDS_MODE == 2); + /* Only update the SGPRs that changed. */ + radeon_set_sh_reg_seq(cs, R_00B904_COMPUTE_USER_DATA_1, 3); + radeon_emit(cs, start_prim); + radeon_emit(cs, start_prim + num_subdraw_prims - 1); + radeon_emit(cs, count_va); + } + + /* Set grid dimensions. */ + unsigned start_block = start_prim / THREADGROUP_SIZE; + unsigned num_full_blocks = num_subdraw_prims / THREADGROUP_SIZE; + unsigned partial_block_size = num_subdraw_prims % THREADGROUP_SIZE; + + radeon_set_sh_reg(cs, R_00B810_COMPUTE_START_X, start_block); + radeon_set_sh_reg(cs, R_00B81C_COMPUTE_NUM_THREAD_X, + S_00B81C_NUM_THREAD_FULL(THREADGROUP_SIZE) | + S_00B81C_NUM_THREAD_PARTIAL(partial_block_size)); + + radeon_emit(cs, PKT3(PKT3_DISPATCH_DIRECT, 3, 0) | + PKT3_SHADER_TYPE_S(1)); + radeon_emit(cs, start_block + num_full_blocks + !!partial_block_size); + radeon_emit(cs, 1); + radeon_emit(cs, 1); + radeon_emit(cs, S_00B800_COMPUTE_SHADER_EN(1) | + S_00B800_PARTIAL_TG_EN(!!partial_block_size) | + S_00B800_ORDERED_APPEND_ENBL(VERTEX_COUNTER_GDS_MODE == 2) | + S_00B800_ORDER_MODE(0 /* launch in order */)); + + /* This is only for unordered append. Ordered append writes this from + * the shader. + * + * Note that EOP and EOS events are super slow, so emulating the event + * in a shader is an important optimization. + */ + if (VERTEX_COUNTER_GDS_MODE == 1) { + si_cp_release_mem(sctx, cs, V_028A90_CS_DONE, 0, + sctx->chip_class <= GFX8 ? EOP_DST_SEL_MEM : EOP_DST_SEL_TC_L2, + EOP_INT_SEL_NONE, + EOP_DATA_SEL_GDS, + NULL, + count_va | ((uint64_t)sctx->screen->info.address32_hi << 32), + EOP_DATA_GDS(gds_offset / 4, 1), + SI_NOT_QUERY); + + /* Now that compute shaders are running, clear the remainder of GDS. 
*/ + if (first_dispatch) { + unsigned offset = gds_offset + gds_size; + si_cp_dma_clear_buffer(sctx, cs, NULL, offset, + GDS_SIZE_UNORDERED - offset, + 0, + SI_CPDMA_SKIP_CHECK_CS_SPACE | + SI_CPDMA_SKIP_GFX_SYNC | + SI_CPDMA_SKIP_SYNC_BEFORE, + SI_COHERENCY_NONE, L2_BYPASS); + } + } + first_dispatch = false; + + assert(cs->current.cdw <= cs->current.max_dw); + assert(gfx_cs->current.cdw <= gfx_cs->current.max_dw); + } +} diff --git a/src/gallium/drivers/radeonsi/si_cp_dma.c b/src/gallium/drivers/radeonsi/si_cp_dma.c index 37ab31a..e83016f 100644 --- a/src/gallium/drivers/radeonsi/si_cp_dma.c +++ b/src/gallium/drivers/radeonsi/si_cp_dma.c @@ -249,8 +249,10 @@ void si_cp_dma_clear_buffer(struct si_context *sctx, struct radeon_cmdbuf *cs, sdst->TC_L2_dirty = true; /* If it's not a framebuffer fast clear... */ - if (coher == SI_COHERENCY_SHADER) + if (coher == SI_COHERENCY_SHADER) { sctx->num_cp_dma_calls++; + si_prim_discard_signal_next_compute_ib_start(sctx); + } } /** @@ -405,8 +407,10 @@ void si_cp_dma_copy_buffer(struct si_context *sctx, si_resource(dst)->TC_L2_dirty = true; /* If it's not a prefetch or GDS copy... */ - if (dst && src && (dst != src || dst_offset != src_offset)) + if (dst && src && (dst != src || dst_offset != src_offset)) { sctx->num_cp_dma_calls++; + si_prim_discard_signal_next_compute_ib_start(sctx); + } } void cik_prefetch_TC_L2_async(struct si_context *sctx, struct pipe_resource *buf, diff --git a/src/gallium/drivers/radeonsi/si_debug.c b/src/gallium/drivers/radeonsi/si_debug.c index bd85fc4..fc2d731 100644 --- a/src/gallium/drivers/radeonsi/si_debug.c +++ b/src/gallium/drivers/radeonsi/si_debug.c @@ -337,6 +337,7 @@ struct si_log_chunk_cs { struct si_saved_cs *cs; bool dump_bo_list; unsigned gfx_begin, gfx_end; + unsigned compute_begin, compute_end; }; static void si_log_chunk_type_cs_destroy(void *data) @@ -394,6 +395,7 @@ static void si_log_chunk_type_cs_print(void *data, FILE *f) struct si_context *ctx = chunk->ctx; struct si_saved_cs *scs = chunk->cs; int last_trace_id = -1; + int last_compute_trace_id = -1; /* We are expecting that the ddebug pipe has already * waited for the context, so this buffer should be idle. @@ -403,8 +405,10 @@ static void si_log_chunk_type_cs_print(void *data, FILE *f) NULL, PIPE_TRANSFER_UNSYNCHRONIZED | PIPE_TRANSFER_READ); - if (map) + if (map) { last_trace_id = map[0]; + last_compute_trace_id = map[1]; + } if (chunk->gfx_end != chunk->gfx_begin) { if (chunk->gfx_begin == 0) { @@ -432,6 +436,21 @@ static void si_log_chunk_type_cs_print(void *data, FILE *f) } } + if (chunk->compute_end != chunk->compute_begin) { + assert(ctx->prim_discard_compute_cs); + + if (scs->flushed) { + ac_parse_ib(f, scs->compute.ib + chunk->compute_begin, + chunk->compute_end - chunk->compute_begin, + &last_compute_trace_id, map ? 1 : 0, "Compute IB", ctx->chip_class, + NULL, NULL); + } else { + si_parse_current_ib(f, ctx->prim_discard_compute_cs, chunk->compute_begin, + chunk->compute_end, &last_compute_trace_id, + map ? 1 : 0, "Compute IB", ctx->chip_class); + } + } + if (chunk->dump_bo_list) { fprintf(f, "Flushing. 
Time: "); util_dump_ns(f, scs->time_flush); @@ -452,9 +471,14 @@ static void si_log_cs(struct si_context *ctx, struct u_log_context *log, struct si_saved_cs *scs = ctx->current_saved_cs; unsigned gfx_cur = ctx->gfx_cs->prev_dw + ctx->gfx_cs->current.cdw; + unsigned compute_cur = 0; + + if (ctx->prim_discard_compute_cs) + compute_cur = ctx->prim_discard_compute_cs->prev_dw + ctx->prim_discard_compute_cs->current.cdw; if (!dump_bo_list && - gfx_cur == scs->gfx_last_dw) + gfx_cur == scs->gfx_last_dw && + compute_cur == scs->compute_last_dw) return; struct si_log_chunk_cs *chunk = calloc(1, sizeof(*chunk)); @@ -467,6 +491,10 @@ static void si_log_cs(struct si_context *ctx, struct u_log_context *log, chunk->gfx_end = gfx_cur; scs->gfx_last_dw = gfx_cur; + chunk->compute_begin = scs->compute_last_dw; + chunk->compute_end = compute_cur; + scs->compute_last_dw = compute_cur; + u_log_chunk(log, &si_log_chunk_type_cs, chunk); } diff --git a/src/gallium/drivers/radeonsi/si_fence.c b/src/gallium/drivers/radeonsi/si_fence.c index fdd10d8..1d67fd8 100644 --- a/src/gallium/drivers/radeonsi/si_fence.c +++ b/src/gallium/drivers/radeonsi/si_fence.c @@ -80,7 +80,7 @@ void si_cp_release_mem(struct si_context *ctx, struct radeon_cmdbuf *cs, EOP_INT_SEL(int_sel) | EOP_DATA_SEL(data_sel); - if (ctx->chip_class >= GFX9) { + if (ctx->chip_class >= GFX9 || cs == ctx->prim_discard_compute_cs) { /* A ZPASS_DONE or PIXEL_STAT_DUMP_EVENT (of the DB occlusion * counters) must immediately precede every timestamp event to * prevent a GPU hang on GFX9. @@ -89,6 +89,7 @@ void si_cp_release_mem(struct si_context *ctx, struct radeon_cmdbuf *cs, * always do ZPASS_DONE before the timestamp. */ if (ctx->chip_class == GFX9 && + cs != ctx->prim_discard_compute_cs && query_type != PIPE_QUERY_OCCLUSION_COUNTER && query_type != PIPE_QUERY_OCCLUSION_PREDICATE && query_type != PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE) { @@ -105,14 +106,15 @@ void si_cp_release_mem(struct si_context *ctx, struct radeon_cmdbuf *cs, RADEON_USAGE_WRITE, RADEON_PRIO_QUERY); } - radeon_emit(cs, PKT3(PKT3_RELEASE_MEM, 6, 0)); + radeon_emit(cs, PKT3(PKT3_RELEASE_MEM, ctx->chip_class >= GFX9 ? 6 : 5, 0)); radeon_emit(cs, op); radeon_emit(cs, sel); radeon_emit(cs, va); /* address lo */ radeon_emit(cs, va >> 32); /* address hi */ radeon_emit(cs, new_fence); /* immediate data lo */ radeon_emit(cs, 0); /* immediate data hi */ - radeon_emit(cs, 0); /* unused */ + if (ctx->chip_class >= GFX9) + radeon_emit(cs, 0); /* unused */ } else { if (ctx->chip_class == GFX7 || ctx->chip_class == GFX8) { diff --git a/src/gallium/drivers/radeonsi/si_gfx_cs.c b/src/gallium/drivers/radeonsi/si_gfx_cs.c index 121ab75..de09099 100644 --- a/src/gallium/drivers/radeonsi/si_gfx_cs.c +++ b/src/gallium/drivers/radeonsi/si_gfx_cs.c @@ -24,6 +24,8 @@ */ #include "si_pipe.h" +#include "si_build_pm4.h" +#include "sid.h" #include "util/os_time.h" #include "util/u_upload_mgr.h" @@ -134,6 +136,24 @@ void si_flush_gfx_cs(struct si_context *ctx, unsigned flags, if (radeon_emitted(ctx->dma_cs, 0)) si_flush_dma_cs(ctx, flags, NULL); + if (radeon_emitted(ctx->prim_discard_compute_cs, 0)) { + struct radeon_cmdbuf *compute_cs = ctx->prim_discard_compute_cs; + si_compute_signal_gfx(ctx); + + /* Make sure compute shaders are idle before leaving the IB, so that + * the next IB doesn't overwrite GDS that might be in use. 
*/ + radeon_emit(compute_cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); + radeon_emit(compute_cs, EVENT_TYPE(V_028A90_CS_PARTIAL_FLUSH) | + EVENT_INDEX(4)); + + /* Save the GDS prim restart counter if needed. */ + if (ctx->preserve_prim_restart_gds_at_flush) { + si_cp_copy_data(ctx, compute_cs, + COPY_DATA_DST_MEM, ctx->wait_mem_scratch, 4, + COPY_DATA_GDS, NULL, 4); + } + } + if (ctx->has_graphics) { if (!LIST_IS_EMPTY(&ctx->active_queries)) si_suspend_queries(ctx); @@ -168,6 +188,32 @@ void si_flush_gfx_cs(struct si_context *ctx, unsigned flags, si_log_hw_flush(ctx); } + if (si_compute_prim_discard_enabled(ctx)) { + /* The compute IB can start after the previous gfx IB starts. */ + if (radeon_emitted(ctx->prim_discard_compute_cs, 0) && + ctx->last_gfx_fence) { + ctx->ws->cs_add_fence_dependency(ctx->gfx_cs, + ctx->last_gfx_fence, + RADEON_DEPENDENCY_PARALLEL_COMPUTE_ONLY | + RADEON_DEPENDENCY_START_FENCE); + } + + /* Remember the last execution barrier. It's in the IB. + * It will signal the start of the next compute IB. + */ + if (flags & RADEON_FLUSH_START_NEXT_GFX_IB_NOW && + ctx->last_pkt3_write_data) { + *ctx->last_pkt3_write_data = PKT3(PKT3_WRITE_DATA, 3, 0); + ctx->last_pkt3_write_data = NULL; + + si_resource_reference(&ctx->last_ib_barrier_buf, ctx->barrier_buf); + ctx->last_ib_barrier_buf_offset = ctx->barrier_buf_offset; + si_resource_reference(&ctx->barrier_buf, NULL); + + ws->fence_reference(&ctx->last_ib_barrier_fence, NULL); + } + } + /* Flush the CS. */ ws->cs_flush(cs, flags, &ctx->last_gfx_fence); if (fence) @@ -175,6 +221,17 @@ void si_flush_gfx_cs(struct si_context *ctx, unsigned flags, ctx->num_gfx_cs_flushes++; + if (si_compute_prim_discard_enabled(ctx)) { + /* Remember the last execution barrier, which is the last fence + * in this case. + */ + if (!(flags & RADEON_FLUSH_START_NEXT_GFX_IB_NOW)) { + ctx->last_pkt3_write_data = NULL; + si_resource_reference(&ctx->last_ib_barrier_buf, NULL); + ws->fence_reference(&ctx->last_ib_barrier_fence, ctx->last_gfx_fence); + } + } + /* Check VM faults if needed. */ if (ctx->screen->debug_flags & DBG(CHECK_VM)) { /* Use conservative timeout 800ms, after which we won't wait any @@ -226,6 +283,16 @@ void si_begin_new_gfx_cs(struct si_context *ctx) if (ctx->is_debug) si_begin_gfx_cs_debug(ctx); + if (ctx->gds) { + ctx->ws->cs_add_buffer(ctx->gfx_cs, ctx->gds, + RADEON_USAGE_READWRITE, 0, 0); + if (ctx->gds_oa) { + ctx->ws->cs_add_buffer(ctx->gfx_cs, ctx->gds_oa, + RADEON_USAGE_READWRITE, 0, 0); + } + } + + /* Always invalidate caches at the beginning of IBs, because external * users (e.g. BO evictions and SDMA/UVD/VCE IBs) can modify our * buffers. @@ -352,6 +419,19 @@ void si_begin_new_gfx_cs(struct si_context *ctx) ctx->last_num_tcs_input_cp = -1; ctx->last_ls_hs_config = -1; /* impossible value */ + ctx->prim_discard_compute_ib_initialized = false; + + /* Compute-based primitive discard: + * The index ring is divided into 2 halves. Switch between the halves + * in the same fashion as doublebuffering. 
+ */ + if (ctx->index_ring_base) + ctx->index_ring_base = 0; + else + ctx->index_ring_base = ctx->index_ring_size_per_ib; + + ctx->index_ring_offset = 0; + if (has_clear_state) { ctx->tracked_regs.reg_value[SI_TRACKED_DB_RENDER_CONTROL] = 0x00000000; ctx->tracked_regs.reg_value[SI_TRACKED_DB_COUNT_CONTROL] = 0x00000000; diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c index d9dae83..9f7159d 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.c +++ b/src/gallium/drivers/radeonsi/si_pipe.c @@ -80,6 +80,9 @@ static const struct debug_named_value debug_options[] = { { "zerovram", DBG(ZERO_VRAM), "Clear VRAM allocations." }, /* 3D engine options: */ + { "alwayspd", DBG(ALWAYS_PD), "Always enable the primitive discard compute shader." }, + { "pd", DBG(PD), "Enable the primitive discard compute shader for large draw calls." }, + { "nopd", DBG(NO_PD), "Disable the primitive discard compute shader." }, { "switch_on_eop", DBG(SWITCH_ON_EOP), "Program WD/IA to switch on end-of-packet." }, { "nooutoforder", DBG(NO_OUT_OF_ORDER), "Disable out-of-order rasterization" }, { "nodpbb", DBG(NO_DPBB), "Disable DPBB." }, @@ -255,7 +258,13 @@ static void si_destroy_context(struct pipe_context *context) sctx->ws->fence_reference(&sctx->last_gfx_fence, NULL); sctx->ws->fence_reference(&sctx->last_sdma_fence, NULL); + sctx->ws->fence_reference(&sctx->last_ib_barrier_fence, NULL); si_resource_reference(&sctx->eop_bug_scratch, NULL); + si_resource_reference(&sctx->index_ring, NULL); + si_resource_reference(&sctx->barrier_buf, NULL); + si_resource_reference(&sctx->last_ib_barrier_buf, NULL); + pb_reference(&sctx->gds, NULL); + pb_reference(&sctx->gds_oa, NULL); si_destroy_compiler(&sctx->compiler); @@ -533,6 +542,7 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen, sctx->blitter->skip_viewport_restore = true; si_init_draw_functions(sctx); + si_initialize_prim_discard_tunables(sctx); } /* Initialize SDMA functions. 
*/ @@ -554,7 +564,7 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen, if (sctx->chip_class >= GFX9) { sctx->wait_mem_scratch = si_resource( - pipe_buffer_create(screen, 0, PIPE_USAGE_DEFAULT, 4)); + pipe_buffer_create(screen, 0, PIPE_USAGE_DEFAULT, 8)); if (!sctx->wait_mem_scratch) goto fail; diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index 488ae74..0d00a9b 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -39,7 +39,7 @@ #endif #define ATI_VENDOR_ID 0x1002 - +#define SI_PRIM_DISCARD_DEBUG 0 #define SI_NOT_QUERY 0xffffffff /* The base vertex and primitive restart can be any number, but we must pick @@ -165,6 +165,9 @@ enum { DBG_ZERO_VRAM, /* 3D engine options: */ + DBG_ALWAYS_PD, + DBG_PD, + DBG_NO_PD, DBG_SWITCH_ON_EOP, DBG_NO_OUT_OF_ORDER, DBG_NO_DPBB, @@ -209,6 +212,7 @@ enum si_coherency { }; struct si_compute; +struct si_shader_context; struct hash_table; struct u_suballocator; @@ -675,6 +679,7 @@ struct si_signed_scissor { struct si_viewports { struct pipe_viewport_state states[SI_MAX_VIEWPORTS]; struct si_signed_scissor as_scissor[SI_MAX_VIEWPORTS]; + bool y_inverted; }; struct si_clip_state { @@ -780,10 +785,12 @@ struct si_saved_cs { struct pipe_reference reference; struct si_context *ctx; struct radeon_saved_cs gfx; + struct radeon_saved_cs compute; struct si_resource *trace_buf; unsigned trace_id; unsigned gfx_last_dw; + unsigned compute_last_dw; bool flushed; int64_t time_flush; }; @@ -839,6 +846,7 @@ struct si_context { struct pipe_debug_callback debug; struct ac_llvm_compiler compiler; /* only non-threaded compilation */ struct si_shader_ctx_state fixed_func_tcs_shader; + /* Offset 0: EOP flush number; Offset 4: GDS prim restart counter */ struct si_resource *wait_mem_scratch; unsigned wait_mem_number; uint16_t prefetch_L2_mask; @@ -859,6 +867,31 @@ struct si_context { uint64_t vram; uint64_t gtt; + /* Compute-based primitive discard. */ + unsigned prim_discard_vertex_count_threshold; + struct pb_buffer *gds; + struct pb_buffer *gds_oa; + struct radeon_cmdbuf *prim_discard_compute_cs; + unsigned compute_gds_offset; + struct si_shader *compute_ib_last_shader; + uint32_t compute_rewind_va; + unsigned compute_num_prims_in_batch; + bool preserve_prim_restart_gds_at_flush; + /* index_ring is divided into 2 halves for doublebuffering. */ + struct si_resource *index_ring; + unsigned index_ring_base; /* offset of a per-IB portion */ + unsigned index_ring_offset; /* offset within a per-IB portion */ + unsigned index_ring_size_per_ib; /* max available size per IB */ + bool prim_discard_compute_ib_initialized; + /* For tracking the last execution barrier - it can be either + * a WRITE_DATA packet or a fence. */ + uint32_t *last_pkt3_write_data; + struct si_resource *barrier_buf; + unsigned barrier_buf_offset; + struct pipe_fence_handle *last_ib_barrier_fence; + struct si_resource *last_ib_barrier_buf; + unsigned last_ib_barrier_buf_offset; + /* Atoms (direct states). */ union si_state_atoms atoms; unsigned dirty_atoms; /* mask */ @@ -895,6 +928,7 @@ struct si_context { struct si_shader_ctx_state vs_shader; struct si_shader_ctx_state tcs_shader; struct si_shader_ctx_state tes_shader; + struct si_shader_ctx_state cs_prim_discard_state; struct si_cs_shader_state cs_shader_state; /* shader information */ @@ -963,6 +997,7 @@ struct si_context { /* Emitted draw state. 
*/ bool gs_tri_strip_adj_fix:1; bool ls_vgpr_fix:1; + bool prim_discard_cs_instancing:1; int last_index_size; int last_base_vertex; int last_start_instance; @@ -1076,6 +1111,7 @@ struct si_context { /* Maintain the list of active queries for pausing between IBs. */ int num_occlusion_queries; int num_perfect_occlusion_queries; + int num_pipeline_stat_queries; struct list_head active_queries; unsigned num_cs_dw_queries_suspend; @@ -1311,6 +1347,26 @@ unsigned si_get_compute_resource_limits(struct si_screen *sscreen, unsigned threadgroups_per_cu); void si_init_compute_functions(struct si_context *sctx); +/* si_compute_prim_discard.c */ +enum si_prim_discard_outcome { + SI_PRIM_DISCARD_ENABLED, + SI_PRIM_DISCARD_DISABLED, + SI_PRIM_DISCARD_DRAW_SPLIT, +}; + +void si_build_prim_discard_compute_shader(struct si_shader_context *ctx); +enum si_prim_discard_outcome +si_prepare_prim_discard_or_split_draw(struct si_context *sctx, + const struct pipe_draw_info *info); +void si_compute_signal_gfx(struct si_context *sctx); +void si_dispatch_prim_discard_cs_and_draw(struct si_context *sctx, + const struct pipe_draw_info *info, + unsigned index_size, + unsigned base_vertex, + uint64_t input_indexbuf_va, + unsigned input_indexbuf_max_elements); +void si_initialize_prim_discard_tunables(struct si_context *sctx); + /* si_perfcounters.c */ void si_init_perfcounters(struct si_screen *screen); void si_destroy_perfcounters(struct si_screen *screen); @@ -1748,6 +1804,11 @@ radeon_add_to_gfx_buffer_list_check_mem(struct si_context *sctx, radeon_add_to_buffer_list(sctx, sctx->gfx_cs, bo, usage, priority); } +static inline bool si_compute_prim_discard_enabled(struct si_context *sctx) +{ + return sctx->prim_discard_vertex_count_threshold != UINT_MAX; +} + #define PRINT_ERR(fmt, args...) 
\ fprintf(stderr, "EE %s:%d %s - " fmt, __FILE__, __LINE__, __func__, ##args) diff --git a/src/gallium/drivers/radeonsi/si_query.c b/src/gallium/drivers/radeonsi/si_query.c index 0e44d75..1dd9249 100644 --- a/src/gallium/drivers/radeonsi/si_query.c +++ b/src/gallium/drivers/radeonsi/si_query.c @@ -850,6 +850,9 @@ static void si_query_hw_emit_start(struct si_context *sctx, si_update_occlusion_query_state(sctx, query->b.type, 1); si_update_prims_generated_query_state(sctx, query->b.type, 1); + if (query->b.type == PIPE_QUERY_PIPELINE_STATISTICS) + sctx->num_pipeline_stat_queries++; + if (query->b.type != SI_QUERY_TIME_ELAPSED_SDMA) si_need_gfx_cs_space(sctx); @@ -954,6 +957,9 @@ static void si_query_hw_emit_stop(struct si_context *sctx, si_update_occlusion_query_state(sctx, query->b.type, -1); si_update_prims_generated_query_state(sctx, query->b.type, -1); + + if (query->b.type == PIPE_QUERY_PIPELINE_STATISTICS) + sctx->num_pipeline_stat_queries--; } static void emit_set_predicate(struct si_context *ctx, diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index c31e94f..7260a6b 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -25,6 +25,7 @@ #include "util/u_memory.h" #include "util/u_string.h" #include "tgsi/tgsi_build.h" +#include "tgsi/tgsi_strings.h" #include "tgsi/tgsi_util.h" #include "tgsi/tgsi_dump.h" @@ -3548,6 +3549,33 @@ static void si_llvm_emit_vs_epilogue(struct ac_shader_abi *abi, FREE(outputs); } +static void si_llvm_emit_prim_discard_cs_epilogue(struct ac_shader_abi *abi, + unsigned max_outputs, + LLVMValueRef *addrs) +{ + struct si_shader_context *ctx = si_shader_context_from_abi(abi); + struct tgsi_shader_info *info = &ctx->shader->selector->info; + LLVMValueRef pos[4] = {}; + + assert(info->num_outputs <= max_outputs); + + for (unsigned i = 0; i < info->num_outputs; i++) { + if (info->output_semantic_name[i] != TGSI_SEMANTIC_POSITION) + continue; + + for (unsigned chan = 0; chan < 4; chan++) + pos[chan] = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], ""); + break; + } + assert(pos[0] != NULL); + + /* Return the position output. 
*/ + LLVMValueRef ret = ctx->return_value; + for (unsigned chan = 0; chan < 4; chan++) + ret = LLVMBuildInsertValue(ctx->ac.builder, ret, pos[chan], chan, ""); + ctx->return_value = ret; +} + static void si_tgsi_emit_epilogue(struct lp_build_tgsi_context *bld_base) { struct si_shader_context *ctx = si_shader_context(bld_base); @@ -4518,6 +4546,12 @@ static void create_function(struct si_shader_context *ctx) /* VGPRs */ declare_vs_input_vgprs(ctx, &fninfo, &num_prolog_vgprs); + + /* Return values */ + if (shader->key.opt.vs_as_prim_discard_cs) { + for (i = 0; i < 4; i++) + returns[num_returns++] = ctx->f32; /* VGPRs */ + } break; case PIPE_SHADER_TESS_CTRL: /* GFX6-GFX8 */ @@ -5317,6 +5351,8 @@ const char *si_get_shader_name(const struct si_shader *shader, unsigned processo return "Vertex Shader as ES"; else if (shader->key.as_ls) return "Vertex Shader as LS"; + else if (shader->key.opt.vs_as_prim_discard_cs) + return "Vertex Shader as Primitive Discard CS"; else return "Vertex Shader as VS"; case PIPE_SHADER_TESS_CTRL: @@ -5699,6 +5735,28 @@ static void si_dump_shader_key(unsigned processor, const struct si_shader *shade fprintf(f, " as_ls = %u\n", key->as_ls); fprintf(f, " mono.u.vs_export_prim_id = %u\n", key->mono.u.vs_export_prim_id); + fprintf(f, " opt.vs_as_prim_discard_cs = %u\n", + key->opt.vs_as_prim_discard_cs); + fprintf(f, " opt.cs_prim_type = %s\n", + tgsi_primitive_names[key->opt.cs_prim_type]); + fprintf(f, " opt.cs_indexed = %u\n", + key->opt.cs_indexed); + fprintf(f, " opt.cs_instancing = %u\n", + key->opt.cs_instancing); + fprintf(f, " opt.cs_primitive_restart = %u\n", + key->opt.cs_primitive_restart); + fprintf(f, " opt.cs_provoking_vertex_first = %u\n", + key->opt.cs_provoking_vertex_first); + fprintf(f, " opt.cs_need_correct_orientation = %u\n", + key->opt.cs_need_correct_orientation); + fprintf(f, " opt.cs_cull_front = %u\n", + key->opt.cs_cull_front); + fprintf(f, " opt.cs_cull_back = %u\n", + key->opt.cs_cull_back); + fprintf(f, " opt.cs_cull_z = %u\n", + key->opt.cs_cull_z); + fprintf(f, " opt.cs_halfz_clip_space = %u\n", + key->opt.cs_halfz_clip_space); break; case PIPE_SHADER_TESS_CTRL: @@ -5854,6 +5912,8 @@ static bool si_compile_tgsi_main(struct si_shader_context *ctx) ctx->abi.emit_outputs = si_llvm_emit_ls_epilogue; else if (shader->key.as_es) ctx->abi.emit_outputs = si_llvm_emit_es_epilogue; + else if (shader->key.opt.vs_as_prim_discard_cs) + ctx->abi.emit_outputs = si_llvm_emit_prim_discard_cs_epilogue; else ctx->abi.emit_outputs = si_llvm_emit_vs_epilogue; bld_base->emit_epilogue = si_tgsi_emit_epilogue; @@ -6644,6 +6704,9 @@ int si_compile_tgsi_shader(struct si_screen *sscreen, si_build_wrapper_function(&ctx, parts + !need_prolog, 1 + need_prolog, need_prolog, 0); + + if (ctx.shader->key.opt.vs_as_prim_discard_cs) + si_build_prim_discard_compute_shader(&ctx); } else if (shader->is_monolithic && ctx.type == PIPE_SHADER_TESS_CTRL) { if (sscreen->info.chip_class >= GFX9) { struct si_shader_selector *ls = shader->key.part.tcs.ls; diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h index 3a63292..bc9299b 100644 --- a/src/gallium/drivers/radeonsi/si_shader.h +++ b/src/gallium/drivers/radeonsi/si_shader.h @@ -340,6 +340,7 @@ struct si_shader_selector { unsigned type; bool vs_needs_prolog; bool force_correct_derivs_after_kill; + bool prim_discard_cs_allowed; unsigned pa_cl_vs_out_cntl; ubyte clipdist_mask; ubyte culldist_mask; @@ -554,6 +555,19 @@ struct si_shader_key { * possible, because it's in the "opt" group. 
*/ unsigned prefer_mono:1; + + /* Primitive discard compute shader. */ + unsigned vs_as_prim_discard_cs:1; + unsigned cs_prim_type:4; + unsigned cs_indexed:1; + unsigned cs_instancing:1; + unsigned cs_primitive_restart:1; + unsigned cs_provoking_vertex_first:1; + unsigned cs_need_correct_orientation:1; + unsigned cs_cull_front:1; + unsigned cs_cull_back:1; + unsigned cs_cull_z:1; + unsigned cs_halfz_clip_space:1; } opt; }; diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c index 7debbc1..f9e8adc 100644 --- a/src/gallium/drivers/radeonsi/si_state.c +++ b/src/gallium/drivers/radeonsi/si_state.c @@ -857,6 +857,15 @@ static void *si_create_rs_state(struct pipe_context *ctx, return NULL; } + if (!state->front_ccw) { + rs->cull_front = !!(state->cull_face & PIPE_FACE_FRONT); + rs->cull_back = !!(state->cull_face & PIPE_FACE_BACK); + } else { + rs->cull_back = !!(state->cull_face & PIPE_FACE_FRONT); + rs->cull_front = !!(state->cull_face & PIPE_FACE_BACK); + } + rs->depth_clamp_any = !state->depth_clip_near || !state->depth_clip_far; + rs->provoking_vertex_first = state->flatshade_first; rs->scissor_enable = state->scissor; rs->clip_halfz = state->clip_halfz; rs->two_side = state->light_twoside; diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h index 613aa32..05e974d 100644 --- a/src/gallium/drivers/radeonsi/si_state.h +++ b/src/gallium/drivers/radeonsi/si_state.h @@ -87,6 +87,10 @@ struct si_state_rasterizer { unsigned rasterizer_discard:1; unsigned scissor_enable:1; unsigned clip_halfz:1; + unsigned cull_front:1; + unsigned cull_back:1; + unsigned depth_clamp_any:1; + unsigned provoking_vertex_first:1; }; struct si_dsa_stencil_ref_part { @@ -600,6 +604,7 @@ void si_shader_selector_key_vs(struct si_context *sctx, struct si_vs_prolog_bits *prolog_key); /* si_state_draw.c */ +void si_prim_discard_signal_next_compute_ib_start(struct si_context *sctx); void si_emit_cache_flush(struct si_context *sctx); void si_trace_emit(struct si_context *sctx); void si_init_draw_functions(struct si_context *sctx); diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c index ad1cf24..b4b3fe3 100644 --- a/src/gallium/drivers/radeonsi/si_state_draw.c +++ b/src/gallium/drivers/radeonsi/si_state_draw.c @@ -29,6 +29,7 @@ #include "util/u_log.h" #include "util/u_upload_mgr.h" #include "util/u_prim.h" +#include "util/u_suballoc.h" #include "ac_debug.h" @@ -676,7 +677,9 @@ static void si_emit_draw_packets(struct si_context *sctx, struct pipe_resource *indexbuf, unsigned index_size, unsigned index_offset, - unsigned instance_count) + unsigned instance_count, + bool dispatch_prim_discard_cs, + unsigned original_index_size) { struct pipe_draw_indirect_info *indirect = info->indirect; struct radeon_cmdbuf *cs = sctx->gfx_cs; @@ -735,13 +738,15 @@ static void si_emit_draw_packets(struct si_context *sctx, sctx->last_index_size = index_size; } - index_max_size = (indexbuf->width0 - index_offset) / - index_size; - index_va = si_resource(indexbuf)->gpu_address + index_offset; + if (original_index_size) { + index_max_size = (indexbuf->width0 - index_offset) / + original_index_size; + index_va = si_resource(indexbuf)->gpu_address + index_offset; - radeon_add_to_buffer_list(sctx, sctx->gfx_cs, - si_resource(indexbuf), - RADEON_USAGE_READ, RADEON_PRIO_INDEX_BUFFER); + radeon_add_to_buffer_list(sctx, sctx->gfx_cs, + si_resource(indexbuf), + RADEON_USAGE_READ, RADEON_PRIO_INDEX_BUFFER); + } } else { 
/* On GFX7 and later, non-indexed draws overwrite VGT_INDEX_TYPE,
* so the state must be re-emitted before the next indexed draw.
@@ -828,7 +833,7 @@ static void si_emit_draw_packets(struct si_context *sctx,
}
/* Base vertex and start instance. */
- base_vertex = index_size ? info->index_bias : info->start;
+ base_vertex = original_index_size ? info->index_bias : info->start;
if (sctx->num_vs_blit_sgprs) {
/* Re-emit draw constants after we leave u_blitter. */
@@ -856,6 +861,17 @@ static void si_emit_draw_packets(struct si_context *sctx,
}
if (index_size) {
+ if (dispatch_prim_discard_cs) {
+ index_va += info->start * original_index_size;
+ index_max_size = MIN2(index_max_size, info->count);
+
+ si_dispatch_prim_discard_cs_and_draw(sctx, info,
+ original_index_size,
+ base_vertex,
+ index_va, index_max_size);
+ return;
+ }
+
index_va += info->start * index_size;
radeon_emit(cs, PKT3(PKT3_DRAW_INDEX_2, 4, render_cond_bit));
@@ -902,6 +918,33 @@ static void si_emit_surface_sync(struct si_context *sctx,
sctx->context_roll = true;
}
+void si_prim_discard_signal_next_compute_ib_start(struct si_context *sctx)
+{
+ if (!si_compute_prim_discard_enabled(sctx))
+ return;
+
+ if (!sctx->barrier_buf) {
+ u_suballocator_alloc(sctx->allocator_zeroed_memory, 4, 4,
+ &sctx->barrier_buf_offset,
+ (struct pipe_resource**)&sctx->barrier_buf);
+ }
+
+ /* Emit a placeholder to signal the next compute IB to start.
+ * See si_compute_prim_discard.c for explanation.
+ */
+ uint32_t signal = 1;
+ si_cp_write_data(sctx, sctx->barrier_buf, sctx->barrier_buf_offset,
+ 4, V_370_MEM, V_370_ME, &signal);
+
+ sctx->last_pkt3_write_data =
+ &sctx->gfx_cs->current.buf[sctx->gfx_cs->current.cdw - 5];
+
+ /* Only the last occurrence of WRITE_DATA will be executed.
+ * The packet will be enabled in si_flush_gfx_cs.
+ */
+ *sctx->last_pkt3_write_data = PKT3(PKT3_NOP, 3, 0);
+}
+
void si_emit_cache_flush(struct si_context *sctx)
{
struct radeon_cmdbuf *cs = sctx->gfx_cs;
@@ -919,8 +962,18 @@ void si_emit_cache_flush(struct si_context *sctx)
}
uint32_t cp_coher_cntl = 0;
- uint32_t flush_cb_db = flags & (SI_CONTEXT_FLUSH_AND_INV_CB |
- SI_CONTEXT_FLUSH_AND_INV_DB);
+ const uint32_t flush_cb_db = flags & (SI_CONTEXT_FLUSH_AND_INV_CB |
+ SI_CONTEXT_FLUSH_AND_INV_DB);
+ const bool is_barrier = flush_cb_db ||
+ /* INV_ICACHE == beginning of gfx IB. Checking
+ * INV_ICACHE fixes corruption for DeusExMD with
+ * compute-based culling, but I don't know why.
+ */
+ flags & (SI_CONTEXT_INV_ICACHE |
+ SI_CONTEXT_PS_PARTIAL_FLUSH |
+ SI_CONTEXT_VS_PARTIAL_FLUSH) ||
+ (flags & SI_CONTEXT_CS_PARTIAL_FLUSH &&
+ sctx->compute_is_busy);
if (flags & SI_CONTEXT_FLUSH_AND_INV_CB)
sctx->num_cb_cache_flushes++;
@@ -1144,6 +1197,9 @@ void si_emit_cache_flush(struct si_context *sctx)
if (cp_coher_cntl)
si_emit_surface_sync(sctx, cp_coher_cntl);
+ if (is_barrier)
+ si_prim_discard_signal_next_compute_ib_start(sctx);
+
if (flags & SI_CONTEXT_START_PIPELINE_STATS) {
radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
radeon_emit(cs, EVENT_TYPE(V_028A90_PIPELINESTAT_START) |
@@ -1260,6 +1316,94 @@ static void si_emit_all_states(struct si_context *sctx, const struct pipe_draw_i
primitive_restart);
}
+static bool
+si_all_vs_resources_read_only(struct si_context *sctx,
+ struct pipe_resource *indexbuf)
+{
+ struct radeon_winsys *ws = sctx->ws;
+ struct radeon_cmdbuf *cs = sctx->gfx_cs;
+
+ /* Index buffer. */
+ if (indexbuf &&
+ ws->cs_is_buffer_referenced(cs, si_resource(indexbuf)->buf,
+ RADEON_USAGE_WRITE))
+ return false;
+
+ /* Vertex buffers. 
*/ + struct si_vertex_elements *velems = sctx->vertex_elements; + unsigned num_velems = velems->count; + + for (unsigned i = 0; i < num_velems; i++) { + if (!((1 << i) & velems->first_vb_use_mask)) + continue; + + unsigned vb_index = velems->vertex_buffer_index[i]; + struct pipe_resource *res = sctx->vertex_buffer[vb_index].buffer.resource; + if (!res) + continue; + + if (ws->cs_is_buffer_referenced(cs, si_resource(res)->buf, + RADEON_USAGE_WRITE)) + return false; + } + + /* Constant and shader buffers. */ + struct si_descriptors *buffers = + &sctx->descriptors[si_const_and_shader_buffer_descriptors_idx(PIPE_SHADER_VERTEX)]; + for (unsigned i = 0; i < buffers->num_active_slots; i++) { + unsigned index = buffers->first_active_slot + i; + struct pipe_resource *res = + sctx->const_and_shader_buffers[PIPE_SHADER_VERTEX].buffers[index]; + if (!res) + continue; + + if (ws->cs_is_buffer_referenced(cs, si_resource(res)->buf, + RADEON_USAGE_WRITE)) + return false; + } + + /* Samplers. */ + struct si_shader_selector *vs = sctx->vs_shader.cso; + if (vs->info.samplers_declared) { + unsigned num_samplers = util_last_bit(vs->info.samplers_declared); + + for (unsigned i = 0; i < num_samplers; i++) { + struct pipe_sampler_view *view = sctx->samplers[PIPE_SHADER_VERTEX].views[i]; + if (!view) + continue; + + if (ws->cs_is_buffer_referenced(cs, + si_resource(view->texture)->buf, + RADEON_USAGE_WRITE)) + return false; + } + } + + /* Images. */ + if (vs->info.images_declared) { + unsigned num_images = util_last_bit(vs->info.images_declared); + + for (unsigned i = 0; i < num_images; i++) { + struct pipe_resource *res = sctx->images[PIPE_SHADER_VERTEX].views[i].resource; + if (!res) + continue; + + if (ws->cs_is_buffer_referenced(cs, si_resource(res)->buf, + RADEON_USAGE_WRITE)) + return false; + } + } + + return true; +} + +static ALWAYS_INLINE bool pd_msg(const char *s) +{ + if (SI_PRIM_DISCARD_DEBUG) + printf("PD failed: %s\n", s); + return false; +} + static void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info) { struct si_context *sctx = (struct si_context *)ctx; @@ -1370,9 +1514,6 @@ static void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *i } } - if (sctx->do_update_shaders && !si_update_shaders(sctx)) - goto return_cleanup; - if (index_size) { /* Translate or upload, if needed. */ /* 8-bit indices are supported on GFX8. */ @@ -1425,6 +1566,11 @@ static void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *i } } + bool dispatch_prim_discard_cs = false; + bool prim_discard_cs_instancing = false; + unsigned original_index_size = index_size; + unsigned direct_count = 0; + if (info->indirect) { struct pipe_draw_indirect_info *indirect = info->indirect; @@ -1444,8 +1590,80 @@ static void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *i si_resource(indirect->indirect_draw_count)->TC_L2_dirty = false; } } + } else { + direct_count = info->count * instance_count; + } + + /* Determine if we can use the primitive discard compute shader. */ + if (si_compute_prim_discard_enabled(sctx) && + /* Multiply by 3 for strips and fans to get the vertex count as triangles. */ + direct_count * (prim == PIPE_PRIM_TRIANGLES ? 1 : 3) > + sctx->prim_discard_vertex_count_threshold && + (!info->count_from_stream_output || pd_msg("draw_opaque")) && + (primitive_restart ? 
+ /* Supported prim types with primitive restart: */
+ (prim == PIPE_PRIM_TRIANGLE_STRIP || pd_msg("bad prim type with primitive restart")) &&
+ /* Disallow instancing with primitive restart: */
+ (instance_count == 1 || pd_msg("instance_count > 1 with primitive restart")) :
+ /* Supported prim types without primitive restart + allow instancing: */
+ (1 << prim) & ((1 << PIPE_PRIM_TRIANGLES) |
+ (1 << PIPE_PRIM_TRIANGLE_STRIP) |
+ (1 << PIPE_PRIM_TRIANGLE_FAN)) &&
+ /* Instancing is limited to 16-bit indices, because InstanceID is packed into VertexID. */
+ /* TODO: DrawArraysInstanced sometimes doesn't work, so it's disabled. */
+ (instance_count == 1 ||
+ (instance_count <= USHRT_MAX && index_size && index_size <= 2) ||
+ pd_msg("instance_count too large or index_size == 4 or DrawArraysInstanced"))) &&
+ (info->drawid == 0 || !sctx->vs_shader.cso->info.uses_drawid || pd_msg("draw_id > 0")) &&
+ (!sctx->render_cond || pd_msg("render condition")) &&
+ /* Forced enablement ignores pipeline statistics queries. */
+ (sctx->screen->debug_flags & (DBG(PD) | DBG(ALWAYS_PD)) ||
+ (!sctx->num_pipeline_stat_queries && !sctx->streamout.prims_gen_query_enabled) ||
+ pd_msg("pipestat or primgen query")) &&
+ (!sctx->vertex_elements->instance_divisor_is_fetched || pd_msg("loads instance divisors")) &&
+ (!sctx->tes_shader.cso || pd_msg("uses tess")) &&
+ (!sctx->gs_shader.cso || pd_msg("uses GS")) &&
+ (!sctx->ps_shader.cso->info.uses_primid || pd_msg("PS uses PrimID")) &&
+#if SI_PRIM_DISCARD_DEBUG /* same as cso->prim_discard_cs_allowed */
+ (!sctx->vs_shader.cso->info.uses_bindless_images || pd_msg("uses bindless images")) &&
+ (!sctx->vs_shader.cso->info.uses_bindless_samplers || pd_msg("uses bindless samplers")) &&
+ (!sctx->vs_shader.cso->info.writes_memory || pd_msg("writes memory")) &&
+ (!sctx->vs_shader.cso->info.writes_viewport_index || pd_msg("writes viewport index")) &&
+ !sctx->vs_shader.cso->info.properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION] &&
+ !sctx->vs_shader.cso->so.num_outputs &&
+#else
+ (sctx->vs_shader.cso->prim_discard_cs_allowed || pd_msg("VS shader uses unsupported features")) &&
+#endif
+ /* Check that all buffers are used for read only, because compute
+ * dispatches can run ahead. */
+ (si_all_vs_resources_read_only(sctx, index_size ? indexbuf : NULL) || pd_msg("write reference"))) {
+ switch (si_prepare_prim_discard_or_split_draw(sctx, info)) {
+ case SI_PRIM_DISCARD_ENABLED:
+ original_index_size = index_size;
+ prim_discard_cs_instancing = instance_count > 1;
+ dispatch_prim_discard_cs = true;
+
+ /* The compute shader changes/lowers the following: */
+ prim = PIPE_PRIM_TRIANGLES;
+ index_size = 4;
+ instance_count = 1;
+ primitive_restart = false;
+ break;
+ case SI_PRIM_DISCARD_DISABLED:
+ break;
+ case SI_PRIM_DISCARD_DRAW_SPLIT:
+ goto return_cleanup;
+ }
}
+ if (prim_discard_cs_instancing != sctx->prim_discard_cs_instancing) {
+ sctx->prim_discard_cs_instancing = prim_discard_cs_instancing;
+ sctx->do_update_shaders = true;
+ }
+
+ if (sctx->do_update_shaders && !si_update_shaders(sctx))
+ goto return_cleanup;
+
si_need_gfx_cs_space(sctx);
if (sctx->bo_list_add_all_gfx_resources)
@@ -1507,7 +1725,8 @@ static void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *i
sctx->dirty_atoms = 0;
si_emit_draw_packets(sctx, info, indexbuf, index_size, index_offset,
- instance_count);
+ instance_count, dispatch_prim_discard_cs,
+ original_index_size);
/* <-- CUs are busy here. */
/* Start prefetches after the draw has been started. 
Both will run @@ -1527,7 +1746,7 @@ static void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *i cik_emit_prefetch_L2(sctx, true); if (!si_upload_graphics_shader_descriptors(sctx)) - return; + goto return_cleanup; si_emit_all_states(sctx, info, prim, instance_count, primitive_restart, masked_atoms); @@ -1540,7 +1759,8 @@ static void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *i sctx->dirty_atoms = 0; si_emit_draw_packets(sctx, info, indexbuf, index_size, index_offset, - instance_count); + instance_count, dispatch_prim_discard_cs, + original_index_size); /* Prefetch the remaining shaders after the draw has been * started. */ diff --git a/src/gallium/drivers/radeonsi/si_state_msaa.c b/src/gallium/drivers/radeonsi/si_state_msaa.c index e6d97fe..0fa3891 100644 --- a/src/gallium/drivers/radeonsi/si_state_msaa.c +++ b/src/gallium/drivers/radeonsi/si_state_msaa.c @@ -82,6 +82,10 @@ * Right half: {1,3,5,7,9,11,13,15} */ +/* Important note: We have to use the standard DX positions, because + * the primitive discard compute shader relies on them. + */ + /* 1x MSAA */ static const uint32_t sample_locs_1x = FILL_SREG( 0, 0, 0, 0, 0, 0, 0, 0); /* S1, S2, S3 fields are not used by 1x */ diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c index 28360aa..628844d 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.c +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c @@ -1383,6 +1383,8 @@ void si_shader_selector_key_vs(struct si_context *sctx, prolog_key->instance_divisor_is_one = elts->instance_divisor_is_one; prolog_key->instance_divisor_is_fetched = elts->instance_divisor_is_fetched; + prolog_key->unpack_instance_id_from_vertex_id = + sctx->prim_discard_cs_instancing; /* Prefer a monolithic shader to allow scheduling divisions around * VBO loads. */ @@ -1910,8 +1912,11 @@ current_not_ready: /* Compile the main shader part if it doesn't exist. This can happen * if the initial guess was wrong. + * + * The prim discard CS doesn't need the main shader part. */ - if (!is_pure_monolithic) { + if (!is_pure_monolithic && + !key->opt.vs_as_prim_discard_cs) { bool ok; /* Make sure the main shader part is present. This is needed @@ -1962,9 +1967,10 @@ current_not_ready: is_pure_monolithic || memcmp(&key->opt, &zeroed.opt, sizeof(key->opt)) != 0; + /* The prim discard CS is always optimized. */ shader->is_optimized = - !is_pure_monolithic && - memcmp(&key->opt, &zeroed.opt, sizeof(key->opt)) != 0; + (!is_pure_monolithic || key->opt.vs_as_prim_discard_cs) && + memcmp(&key->opt, &zeroed.opt, sizeof(key->opt)) != 0; /* If it's an optimized shader, compile it asynchronously. */ if (shader->is_optimized && thread_index < 0) { @@ -2312,6 +2318,15 @@ static void *si_create_shader_selector(struct pipe_context *ctx, sel->info.uses_kill && sctx->screen->debug_flags & DBG(FS_CORRECT_DERIVS_AFTER_KILL); + sel->prim_discard_cs_allowed = + sel->type == PIPE_SHADER_VERTEX && + !sel->info.uses_bindless_images && + !sel->info.uses_bindless_samplers && + !sel->info.writes_memory && + !sel->info.writes_viewport_index && + !sel->info.properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION] && + !sel->so.num_outputs; + /* Set which opcode uses which (i,j) pair. 
*/ if (sel->info.uses_persp_opcode_interp_centroid) sel->info.uses_persp_centroid = true; diff --git a/src/gallium/drivers/radeonsi/si_state_viewport.c b/src/gallium/drivers/radeonsi/si_state_viewport.c index a144d7b..39c8536 100644 --- a/src/gallium/drivers/radeonsi/si_state_viewport.c +++ b/src/gallium/drivers/radeonsi/si_state_viewport.c @@ -381,6 +381,12 @@ static void si_set_viewport_states(struct pipe_context *pctx, scissor->quant_mode = SI_QUANT_MODE_16_8_FIXED_POINT_1_256TH; } + if (start_slot == 0) { + ctx->viewports.y_inverted = + -state->scale[1] + state->translate[1] > + state->scale[1] + state->translate[1]; + } + si_mark_atom_dirty(ctx, &ctx->atoms.s.viewports); si_mark_atom_dirty(ctx, &ctx->atoms.s.guardband); si_mark_atom_dirty(ctx, &ctx->atoms.s.scissors); -- 2.7.4
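
Illustrative appendix (not part of the patch above): the standalone C sketch below re-implements, under assumed tunables, the draw-splitting and dispatch-grid arithmetic that si_dispatch_prim_discard_cs_and_draw() performs when it breaks a large draw into sub-draws (one primitive per compute thread, full threadgroups plus an optional partial group for the remainder). EXAMPLE_SPLIT_PRIMS_PACKET_LEVEL and EXAMPLE_THREADGROUP_SIZE are hypothetical stand-ins; the real SPLIT_PRIMS_PACKET_LEVEL and THREADGROUP_SIZE constants are defined earlier in si_compute_prim_discard.c and are not shown in this excerpt.

#include <stdio.h>

/* Assumed stand-ins for the driver tunables; the split level must be a
 * multiple of the threadgroup size, as the STATIC_ASSERT in the patch requires. */
#define EXAMPLE_SPLIT_PRIMS_PACKET_LEVEL 1024
#define EXAMPLE_THREADGROUP_SIZE         256

static void example_split_draw(unsigned num_prims)
{
   for (unsigned start_prim = 0; start_prim < num_prims;
        start_prim += EXAMPLE_SPLIT_PRIMS_PACKET_LEVEL) {
      /* Same sub-draw sizing as the loop in the patch. */
      unsigned num_subdraw_prims =
         start_prim + EXAMPLE_SPLIT_PRIMS_PACKET_LEVEL < num_prims ?
            EXAMPLE_SPLIT_PRIMS_PACKET_LEVEL : num_prims - start_prim;

      /* Same grid math as the patch: first block index, number of full
       * threadgroups, and size of the trailing partial group. */
      unsigned start_block = start_prim / EXAMPLE_THREADGROUP_SIZE;
      unsigned num_full_blocks = num_subdraw_prims / EXAMPLE_THREADGROUP_SIZE;
      unsigned partial_block_size = num_subdraw_prims % EXAMPLE_THREADGROUP_SIZE;
      unsigned dispatch_dim_x = start_block + num_full_blocks + !!partial_block_size;

      printf("sub-draw: prims [%u, %u), COMPUTE_START_X = %u, "
             "DISPATCH_DIRECT X = %u, partial group threads = %u\n",
             start_prim, start_prim + num_subdraw_prims,
             start_block, dispatch_dim_x, partial_block_size);
   }
}

int main(void)
{
   example_split_draw(2500); /* 2500 triangles -> 3 sub-draws with the values above */
   return 0;
}

With these assumed values, a 2500-primitive draw splits into sub-draws of 1024, 1024 and 452 primitives; the last sub-draw dispatches one full 256-thread group plus a 196-thread partial group, which corresponds to the NUM_THREAD_PARTIAL / PARTIAL_TG_EN programming in the patch.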