2 * Copyright © 2017 Intel Corporation
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice shall be included
12 * in all copies or substantial portions of the Software.
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
15 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
20 * DEALINGS IN THE SOFTWARE.
24 * @file crocus_state.c
26 * ============================= GENXML CODE =============================
27 * [This file is compiled once per generation.]
28 * =======================================================================
30 * This is the main state upload code.
32 * Gallium uses Constant State Objects, or CSOs, for most state. Large,
33 * complex, or highly reusable state can be created once, and bound and
34 * rebound multiple times. This is modeled with the pipe->create_*_state()
35 * and pipe->bind_*_state() hooks. Highly dynamic or inexpensive state is
36 * streamed out on the fly, via pipe->set_*_state() hooks.
38 * OpenGL involves frequently mutating context state, which is mirrored in
39 * core Mesa by highly mutable data structures. However, most applications
40 * typically draw the same things over and over - from frame to frame, most
41 * of the same objects are still visible and need to be redrawn. So, rather
42 * than inventing new state all the time, applications usually mutate to swap
43 * between known states that we've seen before.
45 * Gallium isolates us from this mutation by tracking API state, and
46 * distilling it into a set of Constant State Objects, or CSOs. Large,
47 * complex, or typically reusable state can be created once, then reused
48 * multiple times. Drivers can create and store their own associated data.
49 * This create/bind model corresponds to the pipe->create_*_state() and
50 * pipe->bind_*_state() driver hooks.
52 * Some state is cheap to create, or expected to be highly dynamic. Rather
53 * than creating and caching piles of CSOs for these, Gallium simply streams
54 * them out, via the pipe->set_*_state() driver hooks.
56 * To reduce draw time overhead, we try to compute as much state at create
57 * time as possible. Wherever possible, we translate the Gallium pipe state
58 * to 3DSTATE commands, and store those commands in the CSO. At draw time,
59 * we can simply memcpy them into a batch buffer.
61 * No hardware matches the abstraction perfectly, so some commands require
62 * information from multiple CSOs. In this case, we can store two copies
63 * of the packet (one in each CSO), and simply | together their DWords at
64 * draw time. Sometimes the second set is trivial (one or two fields), so
65 * we simply pack it at draw time.
67 * There are two main components in the file below. First, the CSO hooks
68 * create/bind/track state. The second are the draw-time upload functions,
69 * crocus_upload_render_state() and crocus_upload_compute_state(), which read
70 * the context state and emit the commands into the actual batch.
84 #include "drm-uapi/i915_drm.h"
85 #include "intel/common/intel_l3_config.h"
86 #include "intel/common/intel_sample_positions.h"
87 #include "intel/compiler/brw_compiler.h"
88 #include "compiler/shader_info.h"
89 #include "pipe/p_context.h"
90 #include "pipe/p_defines.h"
91 #include "pipe/p_screen.h"
92 #include "pipe/p_state.h"
93 #include "util/format/u_format.h"
94 #include "util/half_float.h"
95 #include "util/u_dual_blend.h"
96 #include "util/u_framebuffer.h"
97 #include "util/u_helpers.h"
98 #include "util/u_inlines.h"
99 #include "util/u_memory.h"
100 #include "util/u_prim.h"
101 #include "util/u_transfer.h"
102 #include "util/u_upload_mgr.h"
103 #include "util/u_viewport.h"
104 #include "crocus_batch.h"
105 #include "crocus_context.h"
106 #include "crocus_defines.h"
107 #include "crocus_pipe.h"
108 #include "crocus_resource.h"
110 #include "crocus_genx_macros.h"
111 #include "intel/common/intel_guardband.h"
112 #include "main/macros.h" /* UNCLAMPED_* */
115 * Statically assert that PIPE_* enums match the hardware packets.
116 * (As long as they match, we don't need to translate them.)
118 UNUSED static void pipe_asserts()
/* PIPE_ASSERT wraps STATIC_ASSERT, so every check in this function is a
 * compile-time check; the function is never called at runtime.
 */
120 #define PIPE_ASSERT(x) STATIC_ASSERT((int)x)
122 /* pipe_logicop happens to match the hardware. */
123 PIPE_ASSERT(PIPE_LOGICOP_CLEAR == LOGICOP_CLEAR);
124 PIPE_ASSERT(PIPE_LOGICOP_NOR == LOGICOP_NOR);
125 PIPE_ASSERT(PIPE_LOGICOP_AND_INVERTED == LOGICOP_AND_INVERTED);
126 PIPE_ASSERT(PIPE_LOGICOP_COPY_INVERTED == LOGICOP_COPY_INVERTED);
127 PIPE_ASSERT(PIPE_LOGICOP_AND_REVERSE == LOGICOP_AND_REVERSE);
128 PIPE_ASSERT(PIPE_LOGICOP_INVERT == LOGICOP_INVERT);
129 PIPE_ASSERT(PIPE_LOGICOP_XOR == LOGICOP_XOR);
130 PIPE_ASSERT(PIPE_LOGICOP_NAND == LOGICOP_NAND);
131 PIPE_ASSERT(PIPE_LOGICOP_AND == LOGICOP_AND);
132 PIPE_ASSERT(PIPE_LOGICOP_EQUIV == LOGICOP_EQUIV);
133 PIPE_ASSERT(PIPE_LOGICOP_NOOP == LOGICOP_NOOP);
134 PIPE_ASSERT(PIPE_LOGICOP_OR_INVERTED == LOGICOP_OR_INVERTED);
135 PIPE_ASSERT(PIPE_LOGICOP_COPY == LOGICOP_COPY);
136 PIPE_ASSERT(PIPE_LOGICOP_OR_REVERSE == LOGICOP_OR_REVERSE);
137 PIPE_ASSERT(PIPE_LOGICOP_OR == LOGICOP_OR);
138 PIPE_ASSERT(PIPE_LOGICOP_SET == LOGICOP_SET);
140 /* pipe_blendfactor happens to match the hardware. */
141 PIPE_ASSERT(PIPE_BLENDFACTOR_ONE == BLENDFACTOR_ONE);
142 PIPE_ASSERT(PIPE_BLENDFACTOR_SRC_COLOR == BLENDFACTOR_SRC_COLOR);
143 PIPE_ASSERT(PIPE_BLENDFACTOR_SRC_ALPHA == BLENDFACTOR_SRC_ALPHA);
144 PIPE_ASSERT(PIPE_BLENDFACTOR_DST_ALPHA == BLENDFACTOR_DST_ALPHA);
145 PIPE_ASSERT(PIPE_BLENDFACTOR_DST_COLOR == BLENDFACTOR_DST_COLOR);
146 PIPE_ASSERT(PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE == BLENDFACTOR_SRC_ALPHA_SATURATE);
147 PIPE_ASSERT(PIPE_BLENDFACTOR_CONST_COLOR == BLENDFACTOR_CONST_COLOR);
148 PIPE_ASSERT(PIPE_BLENDFACTOR_CONST_ALPHA == BLENDFACTOR_CONST_ALPHA);
149 PIPE_ASSERT(PIPE_BLENDFACTOR_SRC1_COLOR == BLENDFACTOR_SRC1_COLOR);
150 PIPE_ASSERT(PIPE_BLENDFACTOR_SRC1_ALPHA == BLENDFACTOR_SRC1_ALPHA);
151 PIPE_ASSERT(PIPE_BLENDFACTOR_ZERO == BLENDFACTOR_ZERO);
152 PIPE_ASSERT(PIPE_BLENDFACTOR_INV_SRC_COLOR == BLENDFACTOR_INV_SRC_COLOR);
153 PIPE_ASSERT(PIPE_BLENDFACTOR_INV_SRC_ALPHA == BLENDFACTOR_INV_SRC_ALPHA);
154 PIPE_ASSERT(PIPE_BLENDFACTOR_INV_DST_ALPHA == BLENDFACTOR_INV_DST_ALPHA);
155 PIPE_ASSERT(PIPE_BLENDFACTOR_INV_DST_COLOR == BLENDFACTOR_INV_DST_COLOR);
156 PIPE_ASSERT(PIPE_BLENDFACTOR_INV_CONST_COLOR == BLENDFACTOR_INV_CONST_COLOR);
157 PIPE_ASSERT(PIPE_BLENDFACTOR_INV_CONST_ALPHA == BLENDFACTOR_INV_CONST_ALPHA);
158 PIPE_ASSERT(PIPE_BLENDFACTOR_INV_SRC1_COLOR == BLENDFACTOR_INV_SRC1_COLOR);
159 PIPE_ASSERT(PIPE_BLENDFACTOR_INV_SRC1_ALPHA == BLENDFACTOR_INV_SRC1_ALPHA);
161 /* pipe_blend_func happens to match the hardware. */
162 PIPE_ASSERT(PIPE_BLEND_ADD == BLENDFUNCTION_ADD);
163 PIPE_ASSERT(PIPE_BLEND_SUBTRACT == BLENDFUNCTION_SUBTRACT);
164 PIPE_ASSERT(PIPE_BLEND_REVERSE_SUBTRACT == BLENDFUNCTION_REVERSE_SUBTRACT);
165 PIPE_ASSERT(PIPE_BLEND_MIN == BLENDFUNCTION_MIN);
166 PIPE_ASSERT(PIPE_BLEND_MAX == BLENDFUNCTION_MAX);
168 /* pipe_stencil_op happens to match the hardware. */
/* Note the naming swap: Gallium's INCR/DECR are saturating (hardware
 * INCRSAT/DECRSAT), while Gallium's *_WRAP variants are the hardware's
 * plain INCR/DECR.  The numeric values still line up, as asserted.
 */
169 PIPE_ASSERT(PIPE_STENCIL_OP_KEEP == STENCILOP_KEEP);
170 PIPE_ASSERT(PIPE_STENCIL_OP_ZERO == STENCILOP_ZERO);
171 PIPE_ASSERT(PIPE_STENCIL_OP_REPLACE == STENCILOP_REPLACE);
172 PIPE_ASSERT(PIPE_STENCIL_OP_INCR == STENCILOP_INCRSAT);
173 PIPE_ASSERT(PIPE_STENCIL_OP_DECR == STENCILOP_DECRSAT);
174 PIPE_ASSERT(PIPE_STENCIL_OP_INCR_WRAP == STENCILOP_INCR);
175 PIPE_ASSERT(PIPE_STENCIL_OP_DECR_WRAP == STENCILOP_DECR);
176 PIPE_ASSERT(PIPE_STENCIL_OP_INVERT == STENCILOP_INVERT);
179 /* pipe_sprite_coord_mode happens to match 3DSTATE_SBE */
180 PIPE_ASSERT(PIPE_SPRITE_COORD_UPPER_LEFT == UPPERLEFT);
181 PIPE_ASSERT(PIPE_SPRITE_COORD_LOWER_LEFT == LOWERLEFT);
/* Translate a Gallium primitive type to the hardware 3DPRIM_* topology.
 * For PIPE_PRIM_PATCHES, the table stores _3DPRIM_PATCHLIST_1 - 1 so that
 * adding verts_per_patch yields _3DPRIM_PATCHLIST_<n>.
 */
187 translate_prim_type(enum pipe_prim_type prim, uint8_t verts_per_patch)
189 static const unsigned map[] = {
190 [PIPE_PRIM_POINTS] = _3DPRIM_POINTLIST,
191 [PIPE_PRIM_LINES] = _3DPRIM_LINELIST,
192 [PIPE_PRIM_LINE_LOOP] = _3DPRIM_LINELOOP,
193 [PIPE_PRIM_LINE_STRIP] = _3DPRIM_LINESTRIP,
194 [PIPE_PRIM_TRIANGLES] = _3DPRIM_TRILIST,
195 [PIPE_PRIM_TRIANGLE_STRIP] = _3DPRIM_TRISTRIP,
196 [PIPE_PRIM_TRIANGLE_FAN] = _3DPRIM_TRIFAN,
197 [PIPE_PRIM_QUADS] = _3DPRIM_QUADLIST,
198 [PIPE_PRIM_QUAD_STRIP] = _3DPRIM_QUADSTRIP,
199 [PIPE_PRIM_POLYGON] = _3DPRIM_POLYGON,
201 [PIPE_PRIM_LINES_ADJACENCY] = _3DPRIM_LINELIST_ADJ,
202 [PIPE_PRIM_LINE_STRIP_ADJACENCY] = _3DPRIM_LINESTRIP_ADJ,
203 [PIPE_PRIM_TRIANGLES_ADJACENCY] = _3DPRIM_TRILIST_ADJ,
204 [PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY] = _3DPRIM_TRISTRIP_ADJ,
/* Base value; patch-list topology is selected by the addition below. */
207 [PIPE_PRIM_PATCHES] = _3DPRIM_PATCHLIST_1 - 1,
211 return map[prim] + (prim == PIPE_PRIM_PATCHES ? verts_per_patch : 0);
/* Translate a Gallium comparison function to the hardware
 * COMPAREFUNCTION_* encoding (a straight 1:1 table lookup).
 */
215 translate_compare_func(enum pipe_compare_func pipe_func)
217 static const unsigned map[] = {
218 [PIPE_FUNC_NEVER] = COMPAREFUNCTION_NEVER,
219 [PIPE_FUNC_LESS] = COMPAREFUNCTION_LESS,
220 [PIPE_FUNC_EQUAL] = COMPAREFUNCTION_EQUAL,
221 [PIPE_FUNC_LEQUAL] = COMPAREFUNCTION_LEQUAL,
222 [PIPE_FUNC_GREATER] = COMPAREFUNCTION_GREATER,
223 [PIPE_FUNC_NOTEQUAL] = COMPAREFUNCTION_NOTEQUAL,
224 [PIPE_FUNC_GEQUAL] = COMPAREFUNCTION_GEQUAL,
225 [PIPE_FUNC_ALWAYS] = COMPAREFUNCTION_ALWAYS,
227 return map[pipe_func];
/* Translate a Gallium shadow-compare function to the hardware
 * PREFILTEROP_* encoding used by sampler state.
 */
231 translate_shadow_func(enum pipe_compare_func pipe_func)
/* Gallium and the hardware define the shadow comparison with opposite
 * operand order and opposite result sense:
 *
 *    Gallium:  result is 1 if (ref <op> texel), 0 otherwise
 *    Hardware: result is 0 if (texel <op> ref), 1 otherwise
 *
 * So we need to flip the operator and also negate, which is what the
 * table below encodes (e.g. LESS -> PREFILTEROP_LEQUAL).
 */
245 static const unsigned map[] = {
246 [PIPE_FUNC_NEVER] = PREFILTEROP_ALWAYS,
247 [PIPE_FUNC_LESS] = PREFILTEROP_LEQUAL,
248 [PIPE_FUNC_EQUAL] = PREFILTEROP_NOTEQUAL,
249 [PIPE_FUNC_LEQUAL] = PREFILTEROP_LESS,
250 [PIPE_FUNC_GREATER] = PREFILTEROP_GEQUAL,
251 [PIPE_FUNC_NOTEQUAL] = PREFILTEROP_EQUAL,
252 [PIPE_FUNC_GEQUAL] = PREFILTEROP_GREATER,
253 [PIPE_FUNC_ALWAYS] = PREFILTEROP_NEVER,
255 return map[pipe_func];
/* Translate a Gallium face-culling mask (PIPE_FACE_*) to CULLMODE_*. */
259 translate_cull_mode(unsigned pipe_face)
261 static const unsigned map[4] = {
262 [PIPE_FACE_NONE] = CULLMODE_NONE,
263 [PIPE_FACE_FRONT] = CULLMODE_FRONT,
264 [PIPE_FACE_BACK] = CULLMODE_BACK,
265 [PIPE_FACE_FRONT_AND_BACK] = CULLMODE_BOTH,
267 return map[pipe_face];
/* Translate a Gallium polygon mode to the hardware FILL_MODE_* encoding.
 * FILL_RECTANGLE has no direct hardware equivalent and is mapped to SOLID.
 */
274 static const unsigned map[4] = {
275 [PIPE_POLYGON_MODE_FILL] = FILL_MODE_SOLID,
276 [PIPE_POLYGON_MODE_LINE] = FILL_MODE_WIREFRAME,
277 [PIPE_POLYGON_MODE_POINT] = FILL_MODE_POINT,
278 [PIPE_POLYGON_MODE_FILL_RECTANGLE] = FILL_MODE_SOLID,
280 return map[pipe_polymode];
/* Translate a Gallium mipmap filter to the hardware MIPFILTER_* encoding. */
285 translate_mip_filter(enum pipe_tex_mipfilter pipe_mip)
287 static const unsigned map[] = {
288 [PIPE_TEX_MIPFILTER_NEAREST] = MIPFILTER_NEAREST,
289 [PIPE_TEX_MIPFILTER_LINEAR] = MIPFILTER_LINEAR,
290 [PIPE_TEX_MIPFILTER_NONE] = MIPFILTER_NONE,
292 return map[pipe_mip];
/* Translate a Gallium texture wrap mode to the hardware TCM_* encoding.
 *
 * @param either_nearest  true when both min and mag filters are NEAREST;
 *                        used to special-case PIPE_TEX_WRAP_CLAMP below.
 */
296 translate_wrap(unsigned pipe_wrap, bool either_nearest)
298 static const unsigned map[] = {
299 [PIPE_TEX_WRAP_REPEAT] = TCM_WRAP,
/* NOTE(review): the two PIPE_TEX_WRAP_CLAMP initializers below sit on
 * different original source lines and are presumably selected by a
 * per-generation #if/#else not visible in this view (HALF_BORDER on
 * newer gens, CLAMP_BORDER on older) — TODO confirm against full file.
 */
301 [PIPE_TEX_WRAP_CLAMP] = TCM_HALF_BORDER,
303 [PIPE_TEX_WRAP_CLAMP] = TCM_CLAMP_BORDER,
305 [PIPE_TEX_WRAP_CLAMP_TO_EDGE] = TCM_CLAMP,
306 [PIPE_TEX_WRAP_CLAMP_TO_BORDER] = TCM_CLAMP_BORDER,
307 [PIPE_TEX_WRAP_MIRROR_REPEAT] = TCM_MIRROR,
308 [PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE] = TCM_MIRROR_ONCE,
310 /* These are unsupported. */
311 [PIPE_TEX_WRAP_MIRROR_CLAMP] = -1,
312 [PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER] = -1,
/* NOTE(review): the statement taken by this branch (original line ~316)
 * is not visible here; presumably it returns TCM_CLAMP so that GL's
 * CLAMP behaves like CLAMP_TO_EDGE under nearest filtering — confirm.
 */
315 if (pipe_wrap == PIPE_TEX_WRAP_CLAMP && either_nearest)
318 return map[pipe_wrap];
322 * Equivalent of brw_state_batch
/* Allocate `size` bytes (at `alignment`) of space in the batch's state
 * buffer, flushing the batch or growing the buffer as needed.
 *
 * Writes the chosen offset to *out_offset and returns a CPU pointer to
 * the mapped space so the caller can fill in the state there.
 */
325 stream_state(struct crocus_batch *batch,
328 uint32_t *out_offset)
330 uint32_t offset = ALIGN(batch->state.used, alignment);
/* If the allocation would exceed the fixed state-buffer budget, flush
 * the batch and retry from the (now reset) used counter...
 */
332 if (offset + size >= STATE_SZ && !batch->no_wrap) {
333 crocus_batch_flush(batch);
334 offset = ALIGN(batch->state.used, alignment);
/* ...otherwise, if it merely exceeds the current BO size, grow the BO
 * by 1.5x (capped by the MIN2 upper bound on the next line).
 */
335 } else if (offset + size >= batch->state.bo->size) {
336 const unsigned new_size =
337 MIN2(batch->state.bo->size + batch->state.bo->size / 2,
339 crocus_grow_buffer(batch, true, batch->state.used, new_size);
340 assert(offset + size < batch->state.bo->size);
343 crocus_record_state_size(batch->state_sizes, offset, size);
345 batch->state.used = offset + size;
346 *out_offset = offset;
/* state.map is a byte map; offset >> 2 converts to a dword index. */
348 return (uint32_t *)batch->state.map + (offset >> 2);
352 * stream_state() + memcpy.
/* Convenience wrapper: allocate space via stream_state() and copy `data`
 * into it.  (See the "stream_state() + memcpy" comment above.)
 */
355 emit_state(struct crocus_batch *batch, const void *data, unsigned size,
359 uint32_t *map = stream_state(batch, size, alignment, &offset);
362 memcpy(map, data, size);
/* Emit 3DSTATE_PIPELINED_POINTERS with the given fixed-function unit state
 * offsets (all relative to the batch's state buffer).  The GS pointer is
 * only meaningful when gs_active is set; the clip unit is always enabled.
 */
369 upload_pipelined_state_pointers(struct crocus_batch *batch,
370 bool gs_active, uint32_t gs_offset,
371 uint32_t vs_offset, uint32_t sf_offset,
372 uint32_t clip_offset, uint32_t wm_offset, uint32_t cc_offset)
375 /* Need to flush before changing clip max threads for errata. */
376 crocus_emit_cmd(batch, GENX(MI_FLUSH), foo);
379 crocus_emit_cmd(batch, GENX(3DSTATE_PIPELINED_POINTERS), pp) {
380 pp.PointertoVSState = ro_bo(batch->state.bo, vs_offset);
381 pp.GSEnable = gs_active;
383 pp.PointertoGSState = ro_bo(batch->state.bo, gs_offset);
384 pp.ClipEnable = true;
385 pp.PointertoCLIPState = ro_bo(batch->state.bo, clip_offset);
386 pp.PointertoSFState = ro_bo(batch->state.bo, sf_offset);
387 pp.PointertoWMState = ro_bo(batch->state.bo, wm_offset);
388 pp.PointertoColorCalcState = ro_bo(batch->state.bo, cc_offset);
394 * Did field 'x' change between 'old_cso' and 'new_cso'?
396 * (If so, we may want to set some dirty flags.)
/* Both macros assume locals named old_cso/new_cso in the caller's scope;
 * a NULL old_cso (first bind) counts as "changed".
 */
398 #define cso_changed(x) (!old_cso || (old_cso->x != new_cso->x))
/* memcmp variant for array/struct fields that can't be compared with !=. */
399 #define cso_changed_memcmp(x) \
400 (!old_cso || memcmp(old_cso->x, new_cso->x, sizeof(old_cso->x)) != 0)
/* Emit the end-of-pipe sync required before re-emitting STATE_BASE_ADDRESS. */
403 flush_before_state_base_change(struct crocus_batch *batch)
406 /* Flush before emitting STATE_BASE_ADDRESS.
408 * This isn't documented anywhere in the PRM. However, it seems to be
409 * necessary prior to changing the surface state base address. We've
410 * seen issues in Vulkan where we get GPU hangs when using multi-level
411 * command buffers which clear depth, reset state base address, and then
414 * Normally, in GL, we would trust the kernel to do sufficient stalls
415 * and flushes prior to executing our batch. However, it doesn't seem
416 * as if the kernel's flushing is always sufficient and we don't want to
419 * We make this an end-of-pipe sync instead of a normal flush because we
420 * do not know the current status of the GPU. On Haswell at least,
421 * having a fast-clear operation in flight at the same time as a normal
422 * rendering operation can cause hangs. Since the kernel's flushing is
423 * insufficient, we need to ensure that any rendering operations from
424 * other processes are definitely complete before we try to do our own
425 * rendering. It's a bit of a big hammer but it appears to work.
/* Gen7+ also has a data cache to flush; earlier gens do not. */
427 const unsigned dc_flush =
428 GFX_VER >= 7 ? PIPE_CONTROL_DATA_CACHE_FLUSH : 0;
429 crocus_emit_end_of_pipe_sync(batch,
430 "change STATE_BASE_ADDRESS (flushes)",
431 PIPE_CONTROL_RENDER_TARGET_FLUSH |
433 PIPE_CONTROL_DEPTH_CACHE_FLUSH);
/* Emit the cache invalidations required after re-emitting STATE_BASE_ADDRESS. */
438 flush_after_state_base_change(struct crocus_batch *batch)
440 /* After re-setting the surface state base address, we have to do some
441 * cache flushing so that the sampler engine will pick up the new
442 * SURFACE_STATE objects and binding tables. From the Broadwell PRM,
443 * Shared Function > 3D Sampler > State > State Caching (page 96):
445 * Coherency with system memory in the state cache, like the texture
446 * cache is handled partially by software. It is expected that the
447 * command stream or shader will issue Cache Flush operation or
448 * Cache_Flush sampler message to ensure that the L1 cache remains
449 * coherent with system memory.
453 * Whenever the value of the Dynamic_State_Base_Addr,
454 * Surface_State_Base_Addr are altered, the L1 state cache must be
455 * invalidated to ensure the new surface or sampler state is fetched
456 * from system memory.
458 * The PIPE_CONTROL command has a "State Cache Invalidation Enable" bit
459 * which, according the PIPE_CONTROL instruction documentation in the
462 * Setting this bit is independent of any other bit in this packet.
463 * This bit controls the invalidation of the L1 and L2 state caches
464 * at the top of the pipe i.e. at the parsing time.
466 * Unfortunately, experimentation seems to indicate that state cache
467 * invalidation through a PIPE_CONTROL does nothing whatsoever in
468 * regards to surface state and binding tables. Instead, it seems that
469 * invalidating the texture cache is what is actually needed.
471 * XXX: As far as we have been able to determine through
472 * experimentation, flushing the texture cache appears to be
473 * sufficient. The theory here is that all of the sampling/rendering
474 * units cache the binding table in the texture cache. However, we have
475 * yet to be able to actually confirm this.
478 crocus_emit_end_of_pipe_sync(batch,
479 "change STATE_BASE_ADDRESS (invalidates)",
480 PIPE_CONTROL_INSTRUCTION_INVALIDATE |
481 PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
482 PIPE_CONTROL_CONST_CACHE_INVALIDATE |
483 PIPE_CONTROL_STATE_CACHE_INVALIDATE);
/* Emit MI_STORE_REGISTER_MEM to write a 32-bit MMIO register to a buffer.
 *
 * NOTE(review): the PredicateEnable assignment and the unreachable() below
 * appear to be alternate branches of a per-generation #if not visible in
 * this view (predication unsupported on older gens) — TODO confirm.
 */
489 crocus_store_register_mem32(struct crocus_batch *batch, uint32_t reg,
490 struct crocus_bo *bo, uint32_t offset,
493 crocus_emit_cmd(batch, GENX(MI_STORE_REGISTER_MEM), srm) {
494 srm.RegisterAddress = reg;
495 srm.MemoryAddress = ggtt_bo(bo, offset);
497 srm.PredicateEnable = predicated;
500 unreachable("unsupported predication");
/* Store a 64-bit MMIO register pair to a buffer as two 32-bit stores. */
506 crocus_store_register_mem64(struct crocus_batch *batch, uint32_t reg,
507 struct crocus_bo *bo, uint32_t offset,
510 crocus_store_register_mem32(batch, reg + 0, bo, offset + 0, predicated);
511 crocus_store_register_mem32(batch, reg + 4, bo, offset + 4, predicated);
/* Emit MI_LOAD_REGISTER_IMM to load an immediate into an MMIO register. */
517 _crocus_emit_lri(struct crocus_batch *batch, uint32_t reg, uint32_t val)
519 crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
520 lri.RegisterOffset = reg;
/* Convenience macro: pass a genxml register name instead of its number. */
524 #define crocus_emit_lri(b, r, v) _crocus_emit_lri(b, GENX(r##_num), v)
/* Emit MI_LOAD_REGISTER_REG to copy one MMIO register into another. */
528 _crocus_emit_lrr(struct crocus_batch *batch, uint32_t dst, uint32_t src)
530 crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_REG), lrr) {
531 lrr.SourceRegisterAddress = src;
532 lrr.DestinationRegisterAddress = dst;
/* Copy a 32-bit MMIO register to another register. */
537 crocus_load_register_reg32(struct crocus_batch *batch, uint32_t dst,
540 _crocus_emit_lrr(batch, dst, src);
/* Copy a 64-bit MMIO register pair to another pair as two 32-bit copies. */
544 crocus_load_register_reg64(struct crocus_batch *batch, uint32_t dst,
547 _crocus_emit_lrr(batch, dst, src);
548 _crocus_emit_lrr(batch, dst + 4, src + 4);
/* Load a 32-bit immediate into an MMIO register. */
553 crocus_load_register_imm32(struct crocus_batch *batch, uint32_t reg,
556 _crocus_emit_lri(batch, reg, val);
/* Load a 64-bit immediate into an MMIO register pair (low dword first). */
560 crocus_load_register_imm64(struct crocus_batch *batch, uint32_t reg,
563 _crocus_emit_lri(batch, reg + 0, val & 0xffffffff);
564 _crocus_emit_lri(batch, reg + 4, val >> 32);
568 * Emit MI_LOAD_REGISTER_MEM to load a 32-bit MMIO register from a buffer.
571 crocus_load_register_mem32(struct crocus_batch *batch, uint32_t reg,
572 struct crocus_bo *bo, uint32_t offset)
574 crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
575 lrm.RegisterAddress = reg;
/* Read-only relocation: the GPU only loads from this address. */
576 lrm.MemoryAddress = ro_bo(bo, offset);
581 * Load a 64-bit value from a buffer into a MMIO register via
582 * two MI_LOAD_REGISTER_MEM commands.
585 crocus_load_register_mem64(struct crocus_batch *batch, uint32_t reg,
586 struct crocus_bo *bo, uint32_t offset)
588 crocus_load_register_mem32(batch, reg + 0, bo, offset + 0);
589 crocus_load_register_mem32(batch, reg + 4, bo, offset + 4);
/* Emit MI_STORE_DATA_IMM to write a 32-bit immediate to a buffer. */
594 crocus_store_data_imm32(struct crocus_batch *batch,
595 struct crocus_bo *bo, uint32_t offset,
598 crocus_emit_cmd(batch, GENX(MI_STORE_DATA_IMM), sdi) {
/* Read-write relocation: the GPU writes to this address. */
599 sdi.Address = rw_bo(bo, offset);
601 sdi.ImmediateData = imm;
/* Emit MI_STORE_DATA_IMM to write a 64-bit immediate to a buffer,
 * packing the command by hand because of the genxml length mismatch
 * described below.
 */
607 crocus_store_data_imm64(struct crocus_batch *batch,
608 struct crocus_bo *bo, uint32_t offset,
611 /* Can't use crocus_emit_cmd because MI_STORE_DATA_IMM has a length of
612 * 2 in genxml but it's actually variable length and we need 5 DWords.
614 void *map = crocus_get_command_space(batch, 4 * 5);
615 _crocus_pack_command(batch, GENX(MI_STORE_DATA_IMM), map, sdi) {
/* DWordLength excludes the first two DWords, per MI command encoding. */
616 sdi.DWordLength = 5 - 2;
617 sdi.Address = rw_bo(bo, offset);
619 sdi.ImmediateData = imm;
/* GPU-side memory-to-memory copy, dword at a time, bounced through a
 * scratch MMIO register (there is no MI_COPY_MEM_MEM on these gens).
 * All offsets and the byte count must be dword-aligned.
 */
626 crocus_copy_mem_mem(struct crocus_batch *batch,
627 struct crocus_bo *dst_bo, uint32_t dst_offset,
628 struct crocus_bo *src_bo, uint32_t src_offset,
631 assert(bytes % 4 == 0);
632 assert(dst_offset % 4 == 0);
633 assert(src_offset % 4 == 0);
635 #define CROCUS_TEMP_REG 0x2440 /* GEN7_3DPRIM_BASE_VERTEX */
636 for (unsigned i = 0; i < bytes; i += 4) {
637 crocus_load_register_mem32(batch, CROCUS_TEMP_REG,
638 src_bo, src_offset + i);
639 crocus_store_register_mem32(batch, CROCUS_TEMP_REG,
640 dst_bo, dst_offset + i, false);
646 * Gallium CSO for rasterizer state.
648 struct crocus_rasterizer_state {
/* The original Gallium rasterizer CSO, kept for fields read at draw time. */
649 struct pipe_rasterizer_state cso;
/* Pre-packed 3DSTATE_* command payloads, memcpy'd into the batch at draw. */
651 uint32_t sf[GENX(3DSTATE_SF_length)];
652 uint32_t clip[GENX(3DSTATE_CLIP_length)];
655 uint32_t raster[GENX(3DSTATE_RASTER_length)];
657 uint32_t line_stipple[GENX(3DSTATE_LINE_STIPPLE_length)];
/* Derived at create time to avoid recomputing per draw. */
659 uint8_t num_clip_plane_consts;
660 bool fill_mode_point_or_line;
/* Per-URB-unit sizing limits (entry counts and entry sizes) used by the
 * gen4/5 URB fence calculation below, indexed by URB_VS..URB_CS.
 */
670 static const struct {
671 uint32_t min_nr_entries;
672 uint32_t preferred_nr_entries;
673 uint32_t min_entry_size;
674 uint32_t max_entry_size;
675 } limits[URB_CS+1] = {
676 { 16, 32, 1, 5 }, /* vs */
677 { 4, 8, 1, 5 }, /* gs */
678 { 5, 10, 1, 5 }, /* clp */
679 { 1, 8, 1, 12 }, /* sf */
680 { 1, 4, 1, 32 } /* cs */
/* Lay out the URB units back-to-back from the current per-unit entry
 * counts/sizes and return whether the whole layout fits in the URB.
 * Note GS and CLIP entries use the VS entry size (vsize); SF has its own.
 */
683 static bool check_urb_layout(struct crocus_context *ice)
685 ice->urb.vs_start = 0;
686 ice->urb.gs_start = ice->urb.nr_vs_entries * ice->urb.vsize;
687 ice->urb.clip_start = ice->urb.gs_start + ice->urb.nr_gs_entries * ice->urb.vsize;
688 ice->urb.sf_start = ice->urb.clip_start + ice->urb.nr_clip_entries * ice->urb.vsize;
689 ice->urb.cs_start = ice->urb.sf_start + ice->urb.nr_sf_entries * ice->urb.sfsize;
691 return ice->urb.cs_start + ice->urb.nr_cs_entries *
692 ice->urb.csize <= ice->urb.size;
/* Recompute the gen4/5 URB partitioning (the "fence") given the required
 * per-entry sizes for CURBE (csize), VS (vsize), and SF (sfsize).
 * Only recalculates when a size grew, or when we are in constrained mode
 * and a size shrank; falls back to progressively smaller entry counts
 * until the layout fits.
 */
697 crocus_calculate_urb_fence(struct crocus_batch *batch, unsigned csize,
698 unsigned vsize, unsigned sfsize)
700 struct crocus_context *ice = batch->ice;
/* Clamp requested sizes up to each unit's minimum entry size. */
701 if (csize < limits[URB_CS].min_entry_size)
702 csize = limits[URB_CS].min_entry_size;
704 if (vsize < limits[URB_VS].min_entry_size)
705 vsize = limits[URB_VS].min_entry_size;
707 if (sfsize < limits[URB_SF].min_entry_size)
708 sfsize = limits[URB_SF].min_entry_size;
710 if (ice->urb.vsize < vsize ||
711 ice->urb.sfsize < sfsize ||
712 ice->urb.csize < csize ||
713 (ice->urb.constrained && (ice->urb.vsize > vsize ||
714 ice->urb.sfsize > sfsize ||
715 ice->urb.csize > csize))) {
718 ice->urb.csize = csize;
719 ice->urb.sfsize = sfsize;
720 ice->urb.vsize = vsize;
/* First attempt: every unit at its preferred entry count. */
722 ice->urb.nr_vs_entries = limits[URB_VS].preferred_nr_entries;
723 ice->urb.nr_gs_entries = limits[URB_GS].preferred_nr_entries;
724 ice->urb.nr_clip_entries = limits[URB_CLP].preferred_nr_entries;
725 ice->urb.nr_sf_entries = limits[URB_SF].preferred_nr_entries;
726 ice->urb.nr_cs_entries = limits[URB_CS].preferred_nr_entries;
728 ice->urb.constrained = 0;
/* Per-generation oversized first attempts (guards presumably in the
 * unsampled lines): try large VS/SF counts, fall back if they don't fit.
 */
731 ice->urb.nr_vs_entries = 128;
732 ice->urb.nr_sf_entries = 48;
733 if (check_urb_layout(ice)) {
736 ice->urb.constrained = 1;
737 ice->urb.nr_vs_entries = limits[URB_VS].preferred_nr_entries;
738 ice->urb.nr_sf_entries = limits[URB_SF].preferred_nr_entries;
740 } else if (GFX_VERx10 == 45) {
741 ice->urb.nr_vs_entries = 64;
742 if (check_urb_layout(ice)) {
745 ice->urb.constrained = 1;
746 ice->urb.nr_vs_entries = limits[URB_VS].preferred_nr_entries;
/* If the preferred counts don't fit either, drop to minimums. */
750 if (!check_urb_layout(ice)) {
751 ice->urb.nr_vs_entries = limits[URB_VS].min_nr_entries;
752 ice->urb.nr_gs_entries = limits[URB_GS].min_nr_entries;
753 ice->urb.nr_clip_entries = limits[URB_CLP].min_nr_entries;
754 ice->urb.nr_sf_entries = limits[URB_SF].min_nr_entries;
755 ice->urb.nr_cs_entries = limits[URB_CS].min_nr_entries;
757 /* Mark us as operating with constrained nr_entries, so that next
758 * time we recalculate we'll resize the fences in the hope of
759 * escaping constrained mode and getting back to normal performance.
761 ice->urb.constrained = 1;
763 if (!check_urb_layout(ice)) {
764 /* This is impossible, given the maximal sizes of urb
765 * entries and the values for minimum nr of entries
768 fprintf(stderr, "couldn't calculate URB layout!\n");
772 if (INTEL_DEBUG(DEBUG_URB|DEBUG_PERF))
773 fprintf(stderr, "URB CONSTRAINED\n");
777 if (INTEL_DEBUG(DEBUG_URB))
779 "URB fence: %d ..VS.. %d ..GS.. %d ..CLP.. %d ..SF.. %d ..CS.. %d\n",
/* Emit the gen4/5 URB_FENCE command with the fence boundaries computed by
 * crocus_calculate_urb_fence(), taking care of the cacheline erratum.
 */
792 crocus_upload_urb_fence(struct crocus_batch *batch)
794 uint32_t urb_fence[3];
/* Pack into a local buffer first so we can pad before emitting (below). */
795 _crocus_pack_command(batch, GENX(URB_FENCE), urb_fence, urb) {
796 urb.VSUnitURBReallocationRequest = 1;
797 urb.GSUnitURBReallocationRequest = 1;
798 urb.CLIPUnitURBReallocationRequest = 1;
799 urb.SFUnitURBReallocationRequest = 1;
800 urb.VFEUnitURBReallocationRequest = 1;
801 urb.CSUnitURBReallocationRequest = 1;
/* Each fence is the END of that unit's region, i.e. the next unit's start. */
803 urb.VSFence = batch->ice->urb.gs_start;
804 urb.GSFence = batch->ice->urb.clip_start;
805 urb.CLIPFence = batch->ice->urb.sf_start;
806 urb.SFFence = batch->ice->urb.cs_start;
807 urb.CSFence = batch->ice->urb.size;
810 /* erratum: URB_FENCE must not cross a 64byte cacheline */
811 if ((crocus_batch_bytes_used(batch) & 15) > 12) {
812 int pad = 16 - (crocus_batch_bytes_used(batch) & 15);
/* Pad with zero dwords (MI_NOOP) until the command won't straddle. */
814 *(uint32_t *)batch->command.map_next = 0;
815 batch->command.map_next += sizeof(uint32_t);
819 crocus_batch_emit(batch, urb_fence, sizeof(uint32_t) * 3);
/* Compute the CURBE (constant URB entry) layout: how many 512-bit register
 * units the WM (fragment), clip, and VS push constants each need, and where
 * each region starts.  Only relayouts when requirements grow, or when they
 * shrink enough to be worth reclaiming.
 */
823 calculate_curbe_offsets(struct crocus_batch *batch)
825 struct crocus_context *ice = batch->ice;
827 unsigned nr_fp_regs, nr_vp_regs, nr_clip_regs = 0;
/* Sum the fragment shader's UBO push ranges. */
831 for (int i = 0; i < 4; i++) {
832 const struct brw_ubo_range *range = &ice->shaders.prog[MESA_SHADER_FRAGMENT]->prog_data->ubo_ranges[i];
833 if (range->length == 0)
836 /* ubo range tracks at 256-bit, we need 512-bit */
837 nr_fp_regs += (range->length + 1) / 2;
/* 6 fixed planes plus one per enabled user clip plane, 4 floats each,
 * rounded up to 512-bit units (16 floats).
 */
840 if (ice->state.cso_rast->cso.clip_plane_enable) {
841 unsigned nr_planes = 6 + util_bitcount(ice->state.cso_rast->cso.clip_plane_enable);
842 nr_clip_regs = (nr_planes * 4 + 15) / 16;
/* Sum the vertex shader's UBO push ranges. */
846 for (int i = 0; i < 4; i++) {
847 const struct brw_ubo_range *range = &ice->shaders.prog[MESA_SHADER_VERTEX]->prog_data->ubo_ranges[i];
848 if (range->length == 0)
851 /* ubo range tracks at 256-bit, we need 512-bit */
852 nr_vp_regs += (range->length + 1) / 2;
854 if (nr_vp_regs == 0) {
855 /* The pre-gen6 VS requires that some push constants get loaded no
856 * matter what, or the GPU would hang.
860 total_regs = nr_fp_regs + nr_vp_regs + nr_clip_regs;
862 /* The CURBE allocation size is limited to 32 512-bit units (128 EU
863 * registers, or 1024 floats). See CS_URB_STATE in the gen4 or gen5
864 * (volume 1, part 1) PRMs.
866 * Note that in brw_fs.cpp we're only loading up to 16 EU registers of
867 * values as push constants before spilling to pull constants, and in
868 * brw_vec4.cpp we're loading up to 32 registers of push constants. An EU
869 * register is 1/2 of one of these URB entry units, so that leaves us 16 EU
872 assert(total_regs <= 32);
/* Relayout only when a region grew, the clip region changed, or the
 * total shrank far enough (below 1/4 of a large layout) to reclaim.
 */
876 if (nr_fp_regs > ice->curbe.wm_size ||
877 nr_vp_regs > ice->curbe.vs_size ||
878 nr_clip_regs != ice->curbe.clip_size ||
879 (total_regs < ice->curbe.total_size / 4 &&
880 ice->curbe.total_size > 16)) {
884 /* Calculate a new layout:
887 ice->curbe.wm_start = reg;
888 ice->curbe.wm_size = nr_fp_regs; reg += nr_fp_regs;
889 ice->curbe.clip_start = reg;
890 ice->curbe.clip_size = nr_clip_regs; reg += nr_clip_regs;
891 ice->curbe.vs_start = reg;
892 ice->curbe.vs_size = nr_vp_regs; reg += nr_vp_regs;
893 ice->curbe.total_size = reg;
896 fprintf(stderr, "curbe wm %d+%d clip %d+%d vs %d+%d\n",
899 ice->curbe.clip_start,
900 ice->curbe.clip_size,
902 ice->curbe.vs_size );
/* Copy a stage's pushed UBO ranges into the CURBE map at 512-bit unit
 * `start`.  Reads each range from the bound constant buffer via a
 * transfer map and memcpys it into place.
 */
909 upload_shader_consts(struct crocus_context *ice,
910 gl_shader_stage stage,
914 struct crocus_compiled_shader *shader = ice->shaders.prog[stage];
915 struct brw_stage_prog_data *prog_data = (void *) shader->prog_data;
/* start is in 512-bit units; 16 floats each. */
918 unsigned offset = start * 16;
920 for (int i = 0; i < 4; i++) {
921 const struct brw_ubo_range *range = &prog_data->ubo_ranges[i];
923 if (range->length == 0)
926 unsigned block_index = crocus_bti_to_group_index(
927 &shader->bt, CROCUS_SURFACE_GROUP_UBO, range->block);
/* range->length/start are in 256-bit units: 8 floats each. */
928 unsigned len = range->length * 8 * sizeof(float);
929 unsigned start = range->start * 8 * sizeof(float);
930 struct pipe_transfer *transfer;
932 cmap = pipe_buffer_map_range(&ice->ctx, ice->state.shaders[stage].constbufs[block_index].buffer,
933 ice->state.shaders[stage].constbufs[block_index].buffer_offset + start, len,
934 PIPE_MAP_READ | PIPE_MAP_UNSYNCHRONIZED, &transfer);
936 memcpy(&map[offset + (total * 8)], cmap, len);
937 pipe_buffer_unmap(&ice->ctx, transfer);
938 total += range->length;
942 if (stage == MESA_SHADER_VERTEX && !found) {
943 /* The pre-gen6 VS requires that some push constants get loaded no
944 * matter what, or the GPU would hang.
/* NOTE(review): the `len` used here is presumably set in unsampled lines
 * of this branch (not the loop-local one above) — confirm in full file.
 */
947 memset(&map[offset], 0, len);
951 static const float fixed_plane[6][4] = {
/* Build and emit the gen4/5 CURBE: allocate an upload buffer, fill in the
 * WM, clip-plane, and VS push-constant regions per the layout from
 * calculate_curbe_offsets(), then emit CONSTANT_BUFFER (plus a workaround
 * state packet on Broadwater/Crestline).
 */
961 gen4_upload_curbe(struct crocus_batch *batch)
963 struct crocus_context *ice = batch->ice;
964 const unsigned sz = ice->curbe.total_size;
/* total_size is in 512-bit units: 16 floats each. */
965 const unsigned buf_sz = sz * 16 * sizeof(float);
971 u_upload_alloc(ice->ctx.const_uploader, 0, buf_sz, 64,
972 &ice->curbe.curbe_offset, (struct pipe_resource **)&ice->curbe.curbe_res, (void **) &map);
974 /* fragment shader constants */
975 if (ice->curbe.wm_size) {
976 upload_shader_consts(ice, MESA_SHADER_FRAGMENT, map, ice->curbe.wm_start);
979 /* clipper constants */
980 if (ice->curbe.clip_size) {
981 unsigned offset = ice->curbe.clip_start * 16;
982 float *fmap = (float *)map;
984 /* If any planes are going this way, send them all this way:
/* The 6 fixed frustum planes always come first... */
986 for (i = 0; i < 6; i++) {
987 fmap[offset + i * 4 + 0] = fixed_plane[i][0];
988 fmap[offset + i * 4 + 1] = fixed_plane[i][1];
989 fmap[offset + i * 4 + 2] = fixed_plane[i][2];
990 fmap[offset + i * 4 + 3] = fixed_plane[i][3];
/* ...followed by each enabled user clip plane. */
993 unsigned mask = ice->state.cso_rast->cso.clip_plane_enable;
994 struct pipe_clip_state *cp = &ice->state.clip_planes;
996 const int j = u_bit_scan(&mask);
997 fmap[offset + i * 4 + 0] = cp->ucp[j][0];
998 fmap[offset + i * 4 + 1] = cp->ucp[j][1];
999 fmap[offset + i * 4 + 2] = cp->ucp[j][2];
1000 fmap[offset + i * 4 + 3] = cp->ucp[j][3];
1005 /* vertex shader constants */
1006 if (ice->curbe.vs_size) {
1007 upload_shader_consts(ice, MESA_SHADER_VERTEX, map, ice->curbe.vs_start);
/* Debug dump of the assembled CURBE contents (guard not sampled here). */
1010 for (int i = 0; i < sz*16; i+=4) {
1011 float *f = (float *)map;
1012 fprintf(stderr, "curbe %d.%d: %f %f %f %f\n", i/8, i&4,
1013 f[i+0], f[i+1], f[i+2], f[i+3]);
1018 crocus_emit_cmd(batch, GENX(CONSTANT_BUFFER), cb) {
1019 if (ice->curbe.curbe_res) {
/* BufferLength is encoded as (units - 1), per the command definition. */
1020 cb.BufferLength = ice->curbe.total_size - 1;
1022 cb.BufferStartingAddress = ro_bo(ice->curbe.curbe_res->bo, ice->curbe.curbe_offset);
1026 #if GFX_VER == 4 && GFX_VERx10 != 45
1027 /* Work around a Broadwater/Crestline depth interpolator bug. The
1028 * following sequence will cause GPU hangs:
1030 * 1. Change state so that all depth related fields in CC_STATE are
1031 * disabled, and in WM_STATE, only "PS Use Source Depth" is enabled.
1032 * 2. Emit a CONSTANT_BUFFER packet.
1033 * 3. Draw via 3DPRIMITIVE.
1035 * The recommended workaround is to emit a non-pipelined state change after
1036 * emitting CONSTANT_BUFFER, in order to drain the windowizer pipeline.
1038 * We arbitrarily choose 3DSTATE_GLOBAL_DEPTH_CLAMP_OFFSET (as it's small),
1039 * and always emit it when "PS Use Source Depth" is set. We could be more
1040 * precise, but the additional complexity is probably not worth it.
1043 const struct shader_info *fs_info =
1044 crocus_get_shader_info(ice, MESA_SHADER_FRAGMENT);
1046 if (BITSET_TEST(fs_info->system_values_read, SYSTEM_VALUE_FRAG_COORD)) {
1047 ice->state.global_depth_offset_clamp = 0;
1048 crocus_emit_cmd(batch, GENX(3DSTATE_GLOBAL_DEPTH_OFFSET_CLAMP), clamp);
1056 #define IVB_L3SQCREG1_SQGHPCI_DEFAULT 0x00730000
1057 #define VLV_L3SQCREG1_SQGHPCI_DEFAULT 0x00d30000
1058 #define HSW_L3SQCREG1_SQGHPCI_DEFAULT 0x00610000
/* Program the L3 cache partitioning (URB/RO/DC/... allocations) described by
 * @cfg into the hardware via LRI writes, with the documented drain/flush
 * PIPE_CONTROL sequence beforehand.
 *
 * NOTE(review): the #if/#else guards separating the two register-layout
 * paths (L3CNTLREG vs. L3SQCREG1/L3CNTLREG2/L3CNTLREG3) appear to have been
 * lost in this extract — verify against the full file.
 */
1061 setup_l3_config(struct crocus_batch *batch, const struct intel_l3_config *cfg)
1064 const struct intel_device_info *devinfo = &batch->screen->devinfo;
/* A partition is "present" if it has ways of its own or via the combined
 * RO/ALL allocations.
 */
1065 const bool has_dc = cfg->n[INTEL_L3P_DC] || cfg->n[INTEL_L3P_ALL];
1066 const bool has_is = cfg->n[INTEL_L3P_IS] || cfg->n[INTEL_L3P_RO] ||
1067 cfg->n[INTEL_L3P_ALL];
1068 const bool has_c = cfg->n[INTEL_L3P_C] || cfg->n[INTEL_L3P_RO] ||
1069 cfg->n[INTEL_L3P_ALL];
1070 const bool has_t = cfg->n[INTEL_L3P_T] || cfg->n[INTEL_L3P_RO] ||
1071 cfg->n[INTEL_L3P_ALL];
1072 const bool has_slm = cfg->n[INTEL_L3P_SLM];
1075 /* According to the hardware docs, the L3 partitioning can only be changed
1076 * while the pipeline is completely drained and the caches are flushed,
1077 * which involves a first PIPE_CONTROL flush which stalls the pipeline...
1079 crocus_emit_pipe_control_flush(batch, "l3_config",
1080 PIPE_CONTROL_DATA_CACHE_FLUSH |
1081 PIPE_CONTROL_CS_STALL);
1083 /* ...followed by a second pipelined PIPE_CONTROL that initiates
1084 * invalidation of the relevant caches. Note that because RO invalidation
1085 * happens at the top of the pipeline (i.e. right away as the PIPE_CONTROL
1086 * command is processed by the CS) we cannot combine it with the previous
1087 * stalling flush as the hardware documentation suggests, because that
1088 * would cause the CS to stall on previous rendering *after* RO
1089 * invalidation and wouldn't prevent the RO caches from being polluted by
1090 * concurrent rendering before the stall completes. This intentionally
1091 * doesn't implement the SKL+ hardware workaround suggesting to enable CS
1092 * stall on PIPE_CONTROLs with the texture cache invalidation bit set for
1093 * GPGPU workloads because the previous and subsequent PIPE_CONTROLs
1094 * already guarantee that there is no concurrent GPGPU kernel execution
1095 * (see SKL HSD 2132585).
1097 crocus_emit_pipe_control_flush(batch, "l3 config",
1098 PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
1099 PIPE_CONTROL_CONST_CACHE_INVALIDATE |
1100 PIPE_CONTROL_INSTRUCTION_INVALIDATE |
1101 PIPE_CONTROL_STATE_CACHE_INVALIDATE);
1103 /* Now send a third stalling flush to make sure that invalidation is
1104 * complete when the L3 configuration registers are modified.
1106 crocus_emit_pipe_control_flush(batch, "l3 config",
1107 PIPE_CONTROL_DATA_CACHE_FLUSH |
1108 PIPE_CONTROL_CS_STALL);
/* Single-register layout: these configs never carve out IS/C/T ways. */
1111 assert(!cfg->n[INTEL_L3P_IS] && !cfg->n[INTEL_L3P_C] && !cfg->n[INTEL_L3P_T]);
1112 crocus_emit_reg(batch, GENX(L3CNTLREG), reg) {
1113 reg.SLMEnable = cfg->n[INTEL_L3P_SLM] > 0;
1114 reg.URBAllocation = cfg->n[INTEL_L3P_URB];
1115 reg.ROAllocation = cfg->n[INTEL_L3P_RO];
1116 reg.DCAllocation = cfg->n[INTEL_L3P_DC];
1117 reg.AllAllocation = cfg->n[INTEL_L3P_ALL];
/* Three-register layout: the combined ALL partition is never used here. */
1120 assert(!cfg->n[INTEL_L3P_ALL]);
1122 /* When enabled SLM only uses a portion of the L3 on half of the banks,
1123 * the matching space on the remaining banks has to be allocated to a
1124 * client (URB for all validated configurations) set to the
1125 * lower-bandwidth 2-bank address hashing mode.
1127 const bool urb_low_bw = has_slm && devinfo->platform != INTEL_PLATFORM_BYT;
1128 assert(!urb_low_bw || cfg->n[INTEL_L3P_URB] == cfg->n[INTEL_L3P_SLM]);
1130 /* Minimum number of ways that can be allocated to the URB. */
1131 const unsigned n0_urb = (devinfo->platform == INTEL_PLATFORM_BYT ? 32 : 0);
1132 assert(cfg->n[INTEL_L3P_URB] >= n0_urb);
1134 uint32_t l3sqcr1, l3cr2, l3cr3;
1136 crocus_pack_state(GENX(L3SQCREG1), &l3sqcr1, reg) {
/* Partitions with no ways fall back to uncacheable. */
1137 reg.ConvertDC_UC = !has_dc;
1138 reg.ConvertIS_UC = !has_is;
1139 reg.ConvertC_UC = !has_c;
1140 reg.ConvertT_UC = !has_t;
1141 #if GFX_VERx10 == 75
1142 reg.L3SQGeneralPriorityCreditInitialization = SQGPCI_DEFAULT;
1144 reg.L3SQGeneralPriorityCreditInitialization =
1145 devinfo->platform == INTEL_PLATFORM_BYT ? BYT_SQGPCI_DEFAULT : SQGPCI_DEFAULT;
1147 reg.L3SQHighPriorityCreditInitialization = SQHPCI_DEFAULT;
1150 crocus_pack_state(GENX(L3CNTLREG2), &l3cr2, reg) {
1151 reg.SLMEnable = has_slm;
1152 reg.URBLowBandwidth = urb_low_bw;
1153 reg.URBAllocation = cfg->n[INTEL_L3P_URB] - n0_urb;
1154 #if !(GFX_VERx10 == 75)
1155 reg.ALLAllocation = cfg->n[INTEL_L3P_ALL];
1157 reg.ROAllocation = cfg->n[INTEL_L3P_RO];
1158 reg.DCAllocation = cfg->n[INTEL_L3P_DC];
1161 crocus_pack_state(GENX(L3CNTLREG3), &l3cr3, reg) {
1162 reg.ISAllocation = cfg->n[INTEL_L3P_IS];
1163 reg.ISLowBandwidth = 0;
1164 reg.CAllocation = cfg->n[INTEL_L3P_C];
1165 reg.CLowBandwidth = 0;
1166 reg.TAllocation = cfg->n[INTEL_L3P_T];
1167 reg.TLowBandwidth = 0;
1170 /* Set up the L3 partitioning. */
1171 crocus_emit_lri(batch, L3SQCREG1, l3sqcr1);
1172 crocus_emit_lri(batch, L3CNTLREG2, l3cr2);
1173 crocus_emit_lri(batch, L3CNTLREG3, l3cr3);
/* Fixed: was "GFX_VERSIONx10", a misspelling of GFX_VERx10 (see the other
 * uses in this file); the typo'd macro is never defined, so the Haswell
 * L3-atomics programming below would silently never be emitted.
 */
1175 #if GFX_VERx10 == 75
1176 /* TODO: Fail screen creation if command parser version < 4 */
1177 uint32_t scratch1, chicken3;
1178 crocus_pack_state(GENX(SCRATCH1), &scratch1, reg) {
1179 reg.L3AtomicDisable = !has_dc;
1181 crocus_pack_state(GENX(CHICKEN3), &chicken3, reg) {
1182 reg.L3AtomicDisableMask = true;
1183 reg.L3AtomicDisable = !has_dc;
1185 crocus_emit_lri(batch, SCRATCH1, scratch1);
1186 crocus_emit_lri(batch, CHICKEN3, chicken3);
/* Pick the precomputed L3 configuration for the requested pipeline (compute
 * vs. 3D) and program it, optionally dumping it when INTEL_DEBUG(DEBUG_L3)
 * is set.
 */
1192 emit_l3_state(struct crocus_batch *batch, bool compute)
1194 const struct intel_l3_config *const cfg =
1195 compute ? batch->screen->l3_config_cs : batch->screen->l3_config_3d;
1197 setup_l3_config(batch, cfg);
1198 if (INTEL_DEBUG(DEBUG_L3)) {
1199 intel_dump_l3_config(cfg, stderr);
1204 * Emit a PIPE_CONTROL command for gen7 with the CS Stall bit set.
/* A CS stall requires a post-sync operation, so pair it with an immediate
 * write to the context's scratch workaround BO (value 0, nobody reads it).
 */
1207 gen7_emit_cs_stall_flush(struct crocus_batch *batch)
1209 crocus_emit_pipe_control_write(batch,
1211 PIPE_CONTROL_CS_STALL
1212 | PIPE_CONTROL_WRITE_IMMEDIATE,
1213 batch->ice->workaround_bo,
1214 batch->ice->workaround_offset, 0);
/* Switch the command streamer between the 3D and GPGPU pipelines, applying
 * the documented pre- and post-PIPELINE_SELECT flush workarounds for each
 * generation.  @pipeline is the genxml pipeline enum (_3D or GPGPU).
 */
1219 emit_pipeline_select(struct crocus_batch *batch, uint32_t pipeline)
1222 /* From the Broadwell PRM, Volume 2a: Instructions, PIPELINE_SELECT:
1224 * Software must clear the COLOR_CALC_STATE Valid field in
1225 * 3DSTATE_CC_STATE_POINTERS command prior to send a PIPELINE_SELECT
1226 * with Pipeline Select set to GPGPU.
1228 * The internal hardware docs recommend the same workaround for Gfx9
1231 if (pipeline == GPGPU)
1232 crocus_emit_cmd(batch, GENX(3DSTATE_CC_STATE_POINTERS), t);
1236 /* From "BXML » GT » MI » vol1a GPU Overview » [Instruction]
1237 * PIPELINE_SELECT [DevBWR+]":
1241 * Software must ensure all the write caches are flushed through a
1242 * stalling PIPE_CONTROL command followed by another PIPE_CONTROL
1243 * command to invalidate read only caches prior to programming
1244 * MI_PIPELINE_SELECT command to change the Pipeline Select Mode."
/* Data-cache flush only exists on Gfx7+. */
1246 const unsigned dc_flush =
1247 GFX_VER >= 7 ? PIPE_CONTROL_DATA_CACHE_FLUSH : 0;
1248 crocus_emit_pipe_control_flush(batch,
1249 "workaround: PIPELINE_SELECT flushes (1/2)",
1250 PIPE_CONTROL_RENDER_TARGET_FLUSH |
1251 PIPE_CONTROL_DEPTH_CACHE_FLUSH |
1253 PIPE_CONTROL_CS_STALL);
1255 crocus_emit_pipe_control_flush(batch,
1256 "workaround: PIPELINE_SELECT flushes (2/2)",
1257 PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
1258 PIPE_CONTROL_CONST_CACHE_INVALIDATE |
1259 PIPE_CONTROL_STATE_CACHE_INVALIDATE |
1260 PIPE_CONTROL_INSTRUCTION_INVALIDATE);
1262 /* From "BXML » GT » MI » vol1a GPU Overview » [Instruction]
1263 * PIPELINE_SELECT [DevBWR+]":
1265 * Project: PRE-DEVSNB
1267 * Software must ensure the current pipeline is flushed via an
1268 * MI_FLUSH or PIPE_CONTROL prior to the execution of PIPELINE_SELECT.
1270 crocus_emit_cmd(batch, GENX(MI_FLUSH), foo);
1273 crocus_emit_cmd(batch, GENX(PIPELINE_SELECT), sel) {
1274 sel.PipelineSelection = pipeline;
/* Gfx7 (but not Haswell): drain with a CS stall and emit a degenerate
 * point-list 3DPRIMITIVE after switching back to 3D.
 */
1277 #if GFX_VER == 7 && !(GFX_VERx10 == 75)
1278 if (pipeline == _3D) {
1279 gen7_emit_cs_stall_flush(batch);
1281 crocus_emit_cmd(batch, GENX(3DPRIMITIVE), prim) {
1282 prim.PrimitiveTopologyType = _3DPRIM_POINTLIST;
1289 * The following diagram shows how we partition the URB:
1291 * 16kB or 32kB Rest of the URB space
1292 * __________-__________ _________________-_________________
1294 * +-------------------------------------------------------------+
1295 * | VS/HS/DS/GS/FS Push | VS/HS/DS/GS URB |
1296 * | Constants | Entries |
1297 * +-------------------------------------------------------------+
1299 * Notably, push constants must be stored at the beginning of the URB
1300 * space, while entries can be stored anywhere. Ivybridge and Haswell
1301 * GT1/GT2 have a maximum constant buffer size of 16kB, while Haswell GT3
1302 * doubles this (32kB).
1304 * Ivybridge and Haswell GT1/GT2 allow push constants to be located (and
1305 * sized) in increments of 1kB. Haswell GT3 requires them to be located and
1306 * sized in increments of 2kB.
1308 * Currently we split the constant buffer space evenly among whatever stages
1309 * are active. This is probably not ideal, but simple.
1311 * Ivybridge GT1 and Haswell GT1 have 128kB of URB space.
1312 * Ivybridge GT2 and Haswell GT2 have 256kB of URB space.
1313 * Haswell GT3 has 512kB of URB space.
1315 * See "Volume 2a: 3D Pipeline," section 1.8, "Volume 1b: Configurations",
1316 * and the documentation for 3DSTATE_PUSH_CONSTANT_ALLOC_xS.
/* Statically partition the push-constant region of the URB evenly across
 * all five shader stages (VS..FS) via 3DSTATE_PUSH_CONSTANT_ALLOC_xS.
 */
1320 crocus_alloc_push_constants(struct crocus_batch *batch)
1322 const unsigned push_constant_kb =
1323 batch->screen->devinfo.max_constant_urb_size_kb;
1324 unsigned size_per_stage = push_constant_kb / 5;
1326 /* For now, we set a static partitioning of the push constant area,
1327 * assuming that all stages could be in use.
1329 * TODO: Try lazily allocating the HS/DS/GS sections as needed, and
1330 * see if that improves performance by offering more space to
1331 * the VS/FS when those aren't in use. Also, try dynamically
1332 * enabling/disabling it like i965 does. This would be more
1333 * stalls and may not actually help; we don't know yet.
1335 for (int i = 0; i <= MESA_SHADER_FRAGMENT; i++) {
1336 crocus_emit_cmd(batch, GENX(3DSTATE_PUSH_CONSTANT_ALLOC_VS), alloc) {
/* ALLOC_VS..ALLOC_PS share a layout; sub-opcodes are consecutive (18+i). */
1337 alloc._3DCommandSubOpcode = 18 + i;
1338 alloc.ConstantBufferOffset = size_per_stage * i;
/* Give the FS whatever is left so integer division doesn't waste space. */
1339 alloc.ConstantBufferSize = i == MESA_SHADER_FRAGMENT ? (push_constant_kb - 4 * size_per_stage) : size_per_stage;
1343 /* From p292 of the Ivy Bridge PRM (11.2.4 3DSTATE_PUSH_CONSTANT_ALLOC_PS):
1345 * A PIPE_CONTROL command with the CS Stall bit set must be programmed
1346 * in the ring after this instruction.
1348 * No such restriction exists for Haswell or Baytrail.
1350 if (batch->screen->devinfo.platform == INTEL_PLATFORM_IVB)
1351 gen7_emit_cs_stall_flush(batch);
1356 * Upload the initial GPU state for a render context.
1358 * This sets some invariant state that needs to be programmed a particular
1359 * way, but we never actually change.
1362 crocus_init_render_context(struct crocus_batch *batch)
1364 UNUSED const struct intel_device_info *devinfo = &batch->screen->devinfo;
1366 emit_pipeline_select(batch, _3D);
/* No system routine: a zeroed STATE_IP is what we want. */
1368 crocus_emit_cmd(batch, GENX(STATE_SIP), foo);
1371 emit_l3_state(batch, false);
/* IVB/BYT: make CONSTANT_BUFFER addresses absolute rather than
 * dynamic-state-base-relative.
 */
1373 #if (GFX_VERx10 == 70 || GFX_VERx10 == 80)
1374 crocus_emit_reg(batch, GENX(INSTPM), reg) {
1375 reg.CONSTANT_BUFFERAddressOffsetDisable = true;
1376 reg.CONSTANT_BUFFERAddressOffsetDisableMask = true;
1379 #if GFX_VER >= 5 || GFX_VERx10 == 45
1380 /* Use the legacy AA line coverage computation. */
1381 crocus_emit_cmd(batch, GENX(3DSTATE_AA_LINE_PARAMETERS), foo);
1384 /* No polygon stippling offsets are necessary. */
1385 /* TODO: may need to set an offset for origin-UL framebuffers */
1386 crocus_emit_cmd(batch, GENX(3DSTATE_POLY_STIPPLE_OFFSET), foo);
1389 crocus_alloc_push_constants(batch);
1393 /* Set the initial MSAA sample positions. */
1394 crocus_emit_cmd(batch, GENX(3DSTATE_SAMPLE_PATTERN), pat) {
1395 INTEL_SAMPLE_POS_1X(pat._1xSample);
1396 INTEL_SAMPLE_POS_2X(pat._2xSample);
1397 INTEL_SAMPLE_POS_4X(pat._4xSample);
1398 INTEL_SAMPLE_POS_8X(pat._8xSample);
1401 /* Disable chromakeying (it's for media) */
1402 crocus_emit_cmd(batch, GENX(3DSTATE_WM_CHROMAKEY), foo);
1404 /* We want regular rendering, not special HiZ operations. */
1405 crocus_emit_cmd(batch, GENX(3DSTATE_WM_HZ_OP), foo);
/* Upload the invariant initial GPU state for a compute (GPGPU) context:
 * just the pipeline select and the compute L3 configuration.
 */
1411 crocus_init_compute_context(struct crocus_batch *batch)
1413 UNUSED const struct intel_device_info *devinfo = &batch->screen->devinfo;
1415 emit_pipeline_select(batch, GPGPU);
1418 emit_l3_state(batch, true);
1424 * Generation-specific context state (ice->state.genx->...).
1426 * Most state can go in crocus_context directly, but these encode hardware
1427 * packets which vary by generation.
1429 struct crocus_genx_state {
/* Per-stage image descriptor parameters pushed to shaders. */
1432 struct brw_image_param image_param[PIPE_MAX_SHADER_IMAGES];
1434 } shaders[MESA_SHADER_STAGES];
/* Whether the Gfx8+ PMA stall-avoidance fix is currently enabled; tracked
 * so crocus_update_pma_fix can skip redundant register writes.
 */
1437 bool pma_fix_enabled;
1442 * The pipe->set_blend_color() driver hook.
1444 * This corresponds to our COLOR_CALC_STATE.
1447 crocus_set_blend_color(struct pipe_context *ctx,
1448 const struct pipe_blend_color *state)
1450 struct crocus_context *ice = (struct crocus_context *) ctx;
1452 /* Our COLOR_CALC_STATE is exactly pipe_blend_color, so just memcpy */
1453 memcpy(&ice->state.blend_color, state, sizeof(struct pipe_blend_color));
/* Flag the gen-appropriate packet for re-emission (the two dirty bits
 * below presumably sit on opposite sides of a stripped #if/#else).
 */
1455 ice->state.dirty |= CROCUS_DIRTY_GEN4_CONSTANT_COLOR;
1457 ice->state.dirty |= CROCUS_DIRTY_COLOR_CALC_STATE;
1462 * Gallium CSO for blend state (see pipe_blend_state).
1464 struct crocus_blend_state {
1466 /** Partial 3DSTATE_PS_BLEND */
1467 uint32_t ps_blend[GENX(3DSTATE_PS_BLEND_length)];
1470 /** copy of BLEND_STATE */
1471 struct pipe_blend_state cso;
1473 /** Bitfield of whether blending is enabled for RT[i] - for aux resolves */
1474 uint8_t blend_enables;
1476 /** Bitfield of whether color writes are enabled for RT[i] */
1477 uint8_t color_write_enables;
1479 /** Does RT[0] use dual color blending? */
1480 bool dual_color_blending;
1483 static enum pipe_blendfactor
1484 fix_blendfactor(enum pipe_blendfactor f, bool alpha_to_one)
1487 if (f == PIPE_BLENDFACTOR_SRC1_ALPHA)
1488 return PIPE_BLENDFACTOR_ONE;
1490 if (f == PIPE_BLENDFACTOR_INV_SRC1_ALPHA)
1491 return PIPE_BLENDFACTOR_ZERO;
1498 typedef struct GENX(BLEND_STATE_ENTRY) BLEND_ENTRY_GENXML;
1500 typedef struct GENX(COLOR_CALC_STATE) BLEND_ENTRY_GENXML;
/* Return true if logic ops may be enabled for the currently-bound
 * framebuffer: pre-Gfx8 hardware restricts logicop to UNORM surfaces, so
 * check the format of the first bound color buffer (or allow it when no
 * color buffer is bound).
 */
1504 can_emit_logic_op(struct crocus_context *ice)
1506 /* all pre gen8 have logicop restricted to unorm */
1507 enum pipe_format pformat = PIPE_FORMAT_NONE;
1508 for (unsigned i = 0; i < ice->state.framebuffer.nr_cbufs; i++) {
1509 if (ice->state.framebuffer.cbufs[i]) {
1510 pformat = ice->state.framebuffer.cbufs[i]->format;
1514 return (pformat == PIPE_FORMAT_NONE || util_format_is_unorm(pformat));
/* Fill the per-RT blend fields of @entry (a BLEND_STATE entry on Gfx6+, or
 * COLOR_CALC_STATE on Gfx4/5) for render target @idx from @cso_blend.
 * Returns whether the RGB and alpha blend equations differ, so the caller
 * can set IndependentAlphaBlendEnable.
 */
1518 set_blend_entry_bits(struct crocus_batch *batch, BLEND_ENTRY_GENXML *entry,
1519 struct crocus_blend_state *cso_blend,
1522 struct crocus_context *ice = batch->ice;
1523 bool independent_alpha_blend = false;
/* With independent blend disabled, every RT uses RT[0]'s state. */
1524 const struct pipe_rt_blend_state *rt =
1525 &cso_blend->cso.rt[cso_blend->cso.independent_blend_enable ? idx : 0];
1526 const unsigned blend_enabled = rt->blend_enable;
/* Fold SRC1_ALPHA factors when alpha-to-one is on. */
1528 enum pipe_blendfactor src_rgb =
1529 fix_blendfactor(rt->rgb_src_factor, cso_blend->cso.alpha_to_one);
1530 enum pipe_blendfactor src_alpha =
1531 fix_blendfactor(rt->alpha_src_factor, cso_blend->cso.alpha_to_one);
1532 enum pipe_blendfactor dst_rgb =
1533 fix_blendfactor(rt->rgb_dst_factor, cso_blend->cso.alpha_to_one);
1534 enum pipe_blendfactor dst_alpha =
1535 fix_blendfactor(rt->alpha_dst_factor, cso_blend->cso.alpha_to_one);
1537 if (rt->rgb_func != rt->alpha_func ||
1538 src_rgb != src_alpha || dst_rgb != dst_alpha)
1539 independent_alpha_blend = true;
1540 if (cso_blend->cso.logicop_enable) {
1541 if (GFX_VER >= 8 || can_emit_logic_op(ice)) {
1542 entry->LogicOpEnable = cso_blend->cso.logicop_enable;
1543 entry->LogicOpFunction = cso_blend->cso.logicop_func;
1545 } else if (blend_enabled) {
/* Blending must stay off when the state says dual-source but the bound
 * fragment shader wasn't compiled for dual-source blending.
 */
1547 struct crocus_compiled_shader *shader = ice->shaders.prog[MESA_SHADER_FRAGMENT];
1548 struct brw_wm_prog_data *wm_prog_data = (void *) shader->prog_data;
1549 entry->ColorBufferBlendEnable =
1550 (!cso_blend->dual_color_blending || wm_prog_data->dual_src_blend);
1552 entry->ColorBufferBlendEnable = 1;
1554 entry->ColorBlendFunction = rt->rgb_func;
1555 entry->AlphaBlendFunction = rt->alpha_func;
/* The casts prevent warnings about implicit enum type conversions. */
1556 entry->SourceBlendFactor = (int) src_rgb;
1557 entry->SourceAlphaBlendFactor = (int) src_alpha;
1558 entry->DestinationBlendFactor = (int) dst_rgb;
1559 entry->DestinationAlphaBlendFactor = (int) dst_alpha;
1563 * Gen4/GM45/ILK can't handle having ColorBufferBlendEnable == 0
1564 * when a dual src blend shader is in use. Setup dummy blending.
1566 struct crocus_compiled_shader *shader = ice->shaders.prog[MESA_SHADER_FRAGMENT];
1567 struct brw_wm_prog_data *wm_prog_data = (void *) shader->prog_data;
1568 if (idx == 0 && !blend_enabled && wm_prog_data->dual_src_blend) {
/* src*ONE + dst*ZERO == plain color write, but keeps blending "on". */
1569 entry->ColorBufferBlendEnable = 1;
1570 entry->ColorBlendFunction = PIPE_BLEND_ADD;
1571 entry->AlphaBlendFunction = PIPE_BLEND_ADD;
1572 entry->SourceBlendFactor = PIPE_BLENDFACTOR_ONE;
1573 entry->SourceAlphaBlendFactor = PIPE_BLENDFACTOR_ONE;
1574 entry->DestinationBlendFactor = PIPE_BLENDFACTOR_ZERO;
1575 entry->DestinationAlphaBlendFactor = PIPE_BLENDFACTOR_ZERO;
1578 return independent_alpha_blend;
1582 * The pipe->create_blend_state() driver hook.
1584 * Translates a pipe_blend_state into crocus_blend_state.
1587 crocus_create_blend_state(struct pipe_context *ctx,
1588 const struct pipe_blend_state *state)
/* NOTE(review): malloc result is used without a NULL check — matches the
 * file's other CSO hooks, but worth confirming that's the intended policy.
 */
1590 struct crocus_blend_state *cso = malloc(sizeof(struct crocus_blend_state));
1592 cso->blend_enables = 0;
1593 cso->color_write_enables = 0;
/* blend_enables/color_write_enables are uint8_t bitfields, one bit per RT. */
1594 STATIC_ASSERT(BRW_MAX_DRAW_BUFFERS <= 8);
1597 cso->dual_color_blending = util_blend_state_is_dual(state, 0);
1600 bool indep_alpha_blend = false;
1602 for (int i = 0; i < BRW_MAX_DRAW_BUFFERS; i++) {
/* With independent blend disabled, every RT mirrors RT[0]. */
1603 const struct pipe_rt_blend_state *rt =
1604 &state->rt[state->independent_blend_enable ? i : 0];
1605 if (rt->blend_enable)
1606 cso->blend_enables |= 1u << i;
1608 cso->color_write_enables |= 1u << i;
1610 enum pipe_blendfactor src_rgb =
1611 fix_blendfactor(rt->rgb_src_factor, state->alpha_to_one);
1612 enum pipe_blendfactor src_alpha =
1613 fix_blendfactor(rt->alpha_src_factor, state->alpha_to_one);
1614 enum pipe_blendfactor dst_rgb =
1615 fix_blendfactor(rt->rgb_dst_factor, state->alpha_to_one);
1616 enum pipe_blendfactor dst_alpha =
1617 fix_blendfactor(rt->alpha_dst_factor, state->alpha_to_one);
/* RGB and alpha equations differing anywhere forces independent alpha. */
1619 if (rt->rgb_func != rt->alpha_func ||
1620 src_rgb != src_alpha || dst_rgb != dst_alpha)
1621 indep_alpha_blend = true;
1626 crocus_pack_command(GENX(3DSTATE_PS_BLEND), cso->ps_blend, pb) {
1627 /* pb.HasWriteableRT is filled in at draw time.
1628 * pb.AlphaTestEnable is filled in at draw time.
1630 * pb.ColorBufferBlendEnable is filled in at draw time so we can avoid
1631 * setting it when dual color blending without an appropriate shader.
1634 pb.AlphaToCoverageEnable = state->alpha_to_coverage;
1635 pb.IndependentAlphaBlendEnable = indep_alpha_blend;
1637 /* The casts prevent warnings about implicit enum type conversions. */
1638 pb.SourceBlendFactor =
1639 (int) fix_blendfactor(state->rt[0].rgb_src_factor, state->alpha_to_one);
1640 pb.SourceAlphaBlendFactor =
1641 (int) fix_blendfactor(state->rt[0].alpha_src_factor, state->alpha_to_one);
1642 pb.DestinationBlendFactor =
1643 (int) fix_blendfactor(state->rt[0].rgb_dst_factor, state->alpha_to_one);
1644 pb.DestinationAlphaBlendFactor =
1645 (int) fix_blendfactor(state->rt[0].alpha_dst_factor, state->alpha_to_one);
1652 * The pipe->bind_blend_state() driver hook.
1654 * Bind a blending CSO and flag related dirty bits.
1657 crocus_bind_blend_state(struct pipe_context *ctx, void *state)
1659 struct crocus_context *ice = (struct crocus_context *) ctx;
1660 struct crocus_blend_state *cso = state;
/* @state may be NULL (unbind); cache the enables for aux resolves. */
1662 ice->state.cso_blend = cso;
1663 ice->state.blend_enables = cso ? cso->blend_enables : 0;
1665 ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_BINDINGS_FS;
1666 ice->state.dirty |= CROCUS_DIRTY_WM;
1668 ice->state.dirty |= CROCUS_DIRTY_GEN6_BLEND_STATE;
1671 ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_FS;
1674 ice->state.dirty |= CROCUS_DIRTY_GEN8_PMA_FIX;
1675 ice->state.dirty |= CROCUS_DIRTY_GEN8_PS_BLEND;
1677 ice->state.dirty |= CROCUS_DIRTY_COLOR_CALC_STATE;
1678 ice->state.dirty |= CROCUS_DIRTY_RENDER_RESOLVES_AND_FLUSHES;
/* NOS = non-orthogonal state: stages whose compiled variants depend on
 * blend state also need their dirty bits raised.
 */
1679 ice->state.stage_dirty |= ice->state.stage_dirty_for_nos[CROCUS_NOS_BLEND];
1683 * Return true if the FS writes to any color outputs which are not disabled
1684 * via color masking.
1687 has_writeable_rt(const struct crocus_blend_state *cso_blend,
1688 const struct shader_info *fs_info)
/* One bit per render-target output actually written by the shader. */
1693 unsigned rt_outputs = fs_info->outputs_written >> FRAG_RESULT_DATA0;
/* gl_FragColor broadcasts to every draw buffer. */
1695 if (fs_info->outputs_written & BITFIELD64_BIT(FRAG_RESULT_COLOR))
1696 rt_outputs = (1 << BRW_MAX_DRAW_BUFFERS) - 1;
1698 return cso_blend->color_write_enables & rt_outputs;
1702 * Gallium CSO for depth, stencil, and alpha testing state.
1704 struct crocus_depth_stencil_alpha_state {
/* Complete copy of the gallium state for draw-time packing. */
1705 struct pipe_depth_stencil_alpha_state cso;
/* Pre-derived flags, used for resolve tracking and the PMA fix. */
1707 bool depth_writes_enabled;
1708 bool stencil_writes_enabled;
1712 * The pipe->create_depth_stencil_alpha_state() driver hook.
1714 * We encode most of 3DSTATE_WM_DEPTH_STENCIL, and just save off the alpha
1715 * testing state since we need pieces of it in a variety of places.
1718 crocus_create_zsa_state(struct pipe_context *ctx,
1719 const struct pipe_depth_stencil_alpha_state *state)
1721 struct crocus_depth_stencil_alpha_state *cso =
1722 malloc(sizeof(struct crocus_depth_stencil_alpha_state));
/* Back-face stencil state only matters when two-sided stencil is on. */
1724 bool two_sided_stencil = state->stencil[1].enabled;
1727 cso->depth_writes_enabled = state->depth_writemask;
1728 cso->stencil_writes_enabled =
1729 state->stencil[0].writemask != 0 ||
1730 (two_sided_stencil && state->stencil[1].writemask != 0);
1732 /* The state tracker needs to optimize away EQUAL writes for us. */
1733 assert(!(state->depth_func == PIPE_FUNC_EQUAL && state->depth_writemask));
1739 * The pipe->bind_depth_stencil_alpha_state() driver hook.
1741 * Bind a depth/stencil/alpha CSO and flag related dirty bits.
1744 crocus_bind_zsa_state(struct pipe_context *ctx, void *state)
1746 struct crocus_context *ice = (struct crocus_context *) ctx;
1747 struct crocus_depth_stencil_alpha_state *old_cso = ice->state.cso_zsa;
1748 struct crocus_depth_stencil_alpha_state *new_cso = state;
/* cso_changed() compares the old and new CSO field; only flag the packets
 * that actually consume the changed field.
 */
1751 if (cso_changed(cso.alpha_ref_value))
1752 ice->state.dirty |= CROCUS_DIRTY_COLOR_CALC_STATE;
1754 if (cso_changed(cso.alpha_enabled))
1755 ice->state.dirty |= CROCUS_DIRTY_WM;
1757 if (cso_changed(cso.alpha_enabled))
1758 ice->state.dirty |= CROCUS_DIRTY_GEN6_BLEND_STATE;
1760 if (cso_changed(cso.alpha_func))
1761 ice->state.dirty |= CROCUS_DIRTY_GEN6_BLEND_STATE;
1764 if (cso_changed(cso.alpha_enabled))
1765 ice->state.dirty |= CROCUS_DIRTY_GEN8_PS_BLEND;
/* Depth-write changes may require HiZ resolves before drawing. */
1768 if (cso_changed(depth_writes_enabled))
1769 ice->state.dirty |= CROCUS_DIRTY_RENDER_RESOLVES_AND_FLUSHES;
1771 ice->state.depth_writes_enabled = new_cso->depth_writes_enabled;
1772 ice->state.stencil_writes_enabled = new_cso->stencil_writes_enabled;
1775 ice->state.dirty |= CROCUS_DIRTY_COLOR_CALC_STATE;
1779 ice->state.cso_zsa = new_cso;
1780 ice->state.dirty |= CROCUS_DIRTY_CC_VIEWPORT;
1782 ice->state.dirty |= CROCUS_DIRTY_GEN6_WM_DEPTH_STENCIL;
1785 ice->state.dirty |= CROCUS_DIRTY_GEN8_PMA_FIX;
/* Notify shader stages whose variants depend on Z/S/alpha state. */
1787 ice->state.stage_dirty |= ice->state.stage_dirty_for_nos[CROCUS_NOS_DEPTH_STENCIL_ALPHA];
/* Evaluate the (heavily documented below) hardware state equation and return
 * whether the Gfx8+ PMA stall-avoidance fix should currently be enabled.
 */
1792 want_pma_fix(struct crocus_context *ice)
1794 UNUSED struct crocus_screen *screen = (void *) ice->ctx.screen;
1795 UNUSED const struct intel_device_info *devinfo = &screen->devinfo;
1796 const struct brw_wm_prog_data *wm_prog_data = (void *)
1797 ice->shaders.prog[MESA_SHADER_FRAGMENT]->prog_data;
1798 const struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
1799 const struct crocus_depth_stencil_alpha_state *cso_zsa = ice->state.cso_zsa;
1800 const struct crocus_blend_state *cso_blend = ice->state.cso_blend;
1802 /* In very specific combinations of state, we can instruct Gfx8-9 hardware
1803 * to avoid stalling at the pixel mask array. The state equations are
1804 * documented in these places:
1806 * - Gfx8 Depth PMA Fix: CACHE_MODE_1::NP_PMA_FIX_ENABLE
1807 * - Gfx9 Stencil PMA Fix: CACHE_MODE_0::STC PMA Optimization Enable
1809 * Both equations share some common elements:
1812 * !(3DSTATE_WM_HZ_OP::DepthBufferClear ||
1813 * 3DSTATE_WM_HZ_OP::DepthBufferResolve ||
1814 * 3DSTATE_WM_HZ_OP::Hierarchical Depth Buffer Resolve Enable ||
1815 * 3DSTATE_WM_HZ_OP::StencilBufferClear) &&
1818 * 3DSTATE_WM::ForceKillPix != ForceOff &&
1819 * (3DSTATE_PS_EXTRA::PixelShaderKillsPixels ||
1820 * 3DSTATE_PS_EXTRA::oMask Present to RenderTarget ||
1821 * 3DSTATE_PS_BLEND::AlphaToCoverageEnable ||
1822 * 3DSTATE_PS_BLEND::AlphaTestEnable ||
1823 * 3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable)
1825 * (Technically the stencil PMA treats ForceKillPix differently,
1826 * but I think this is a documentation oversight, and we don't
1827 * ever use it in this way, so it doesn't matter).
1830 * 3DSTATE_WM::ForceThreadDispatch != 1 &&
1831 * 3DSTATE_RASTER::ForceSampleCount == NUMRASTSAMPLES_0 &&
1832 * 3DSTATE_DEPTH_BUFFER::SURFACE_TYPE != NULL &&
1833 * 3DSTATE_DEPTH_BUFFER::HIZ Enable &&
1834 * 3DSTATE_WM::EDSC_Mode != EDSC_PREPS &&
1835 * 3DSTATE_PS_EXTRA::PixelShaderValid &&
1838 * These are always true:
1840 * 3DSTATE_RASTER::ForceSampleCount == NUMRASTSAMPLES_0
1841 * 3DSTATE_PS_EXTRA::PixelShaderValid
1843 * Also, we never use the normal drawing path for HiZ ops; these are true:
1845 * !(3DSTATE_WM_HZ_OP::DepthBufferClear ||
1846 * 3DSTATE_WM_HZ_OP::DepthBufferResolve ||
1847 * 3DSTATE_WM_HZ_OP::Hierarchical Depth Buffer Resolve Enable ||
1848 * 3DSTATE_WM_HZ_OP::StencilBufferClear)
1850 * This happens sometimes:
1852 * 3DSTATE_WM::ForceThreadDispatch != 1
1854 * However, we choose to ignore it as it either agrees with the signal
1855 * (dispatch was already enabled, so nothing out of the ordinary), or
1856 * there are no framebuffer attachments (so no depth or HiZ anyway,
1857 * meaning the PMA signal will already be disabled).
1863 struct crocus_resource *zres, *sres;
1864 crocus_get_depth_stencil_resources(devinfo,
1865 cso_fb->zsbuf->texture, &zres, &sres);
1867 /* 3DSTATE_DEPTH_BUFFER::SURFACE_TYPE != NULL &&
1868 * 3DSTATE_DEPTH_BUFFER::HIZ Enable &&
1870 if (!zres || !crocus_resource_level_has_hiz(zres, cso_fb->zsbuf->u.tex.level))
1873 /* 3DSTATE_WM::EDSC_Mode != EDSC_PREPS */
1874 if (wm_prog_data->early_fragment_tests)
1877 /* 3DSTATE_WM::ForceKillPix != ForceOff &&
1878 * (3DSTATE_PS_EXTRA::PixelShaderKillsPixels ||
1879 * 3DSTATE_PS_EXTRA::oMask Present to RenderTarget ||
1880 * 3DSTATE_PS_BLEND::AlphaToCoverageEnable ||
1881 * 3DSTATE_PS_BLEND::AlphaTestEnable ||
1882 * 3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable)
1884 bool killpixels = wm_prog_data->uses_kill || wm_prog_data->uses_omask ||
1885 cso_blend->cso.alpha_to_coverage || cso_zsa->cso.alpha_enabled;
1887 /* The Gfx8 depth PMA equation becomes:
1890 * 3DSTATE_WM_DEPTH_STENCIL::DepthWriteEnable &&
1891 * 3DSTATE_DEPTH_BUFFER::DEPTH_WRITE_ENABLE
1894 * 3DSTATE_WM_DEPTH_STENCIL::Stencil Buffer Write Enable &&
1895 * 3DSTATE_DEPTH_BUFFER::STENCIL_WRITE_ENABLE &&
1896 * 3DSTATE_STENCIL_BUFFER::STENCIL_BUFFER_ENABLE
1900 * 3DSTATE_WM_DEPTH_STENCIL::DepthTestEnable &&
1901 * ((killpixels && (depth_writes || stencil_writes)) ||
1902 * 3DSTATE_PS_EXTRA::PixelShaderComputedDepthMode != PSCDEPTH_OFF)
1905 if (!cso_zsa->cso.depth_enabled)
1908 return wm_prog_data->computed_depth_mode != PSCDEPTH_OFF ||
1909 (killpixels && (cso_zsa->depth_writes_enabled ||
1910 (sres && cso_zsa->stencil_writes_enabled)));
/* Enable or disable the PMA stall-avoidance fix in CACHE_MODE_1, bracketed
 * by the flushes the PRM requires around the register write.  No-op when
 * the cached enable state already matches @enable.
 */
1914 genX(crocus_update_pma_fix)(struct crocus_context *ice,
1915 struct crocus_batch *batch,
1919 struct crocus_genx_state *genx = ice->state.genx;
1921 if (genx->pma_fix_enabled == enable)
1924 genx->pma_fix_enabled = enable;
1926 /* According to the Broadwell PIPE_CONTROL documentation, software should
1927 * emit a PIPE_CONTROL with the CS Stall and Depth Cache Flush bits set
1928 * prior to the LRI. If stencil buffer writes are enabled, then a Render
1929 * Cache Flush is also necessary.
1930 * The Gfx9 docs say to use a depth stall rather than a command streamer
1931 * stall. However, the hardware seems to violently disagree. A full
1932 * command streamer stall seems to be needed in both cases.
1934 crocus_emit_pipe_control_flush(batch, "PMA fix change (1/2)",
1935 PIPE_CONTROL_CS_STALL |
1936 PIPE_CONTROL_DEPTH_CACHE_FLUSH |
1937 PIPE_CONTROL_RENDER_TARGET_FLUSH);
1939 crocus_emit_reg(batch, GENX(CACHE_MODE_1), reg) {
1940 reg.NPPMAFixEnable = enable;
1941 reg.NPEarlyZFailsDisable = enable;
1942 reg.NPPMAFixEnableMask = true;
1943 reg.NPEarlyZFailsDisableMask = true;
1946 /* After the LRI, a PIPE_CONTROL with both the Depth Stall and Depth Cache
1947 * Flush bits is often necessary. We do it regardless because it's easier.
1948 * The render cache flush is also necessary if stencil writes are enabled.
1950 * Again, the Gfx9 docs give a different set of flushes but the Broadwell
1951 * flushes seem to work just as well.
1953 crocus_emit_pipe_control_flush(batch, "PMA fix change (2/2)",
1954 PIPE_CONTROL_DEPTH_STALL |
1955 PIPE_CONTROL_DEPTH_CACHE_FLUSH |
1956 PIPE_CONTROL_RENDER_TARGET_FLUSH);
/* Compute the hardware line width for the given rasterizer state, applying
 * the GL rounding rule for aliased lines and the thin-smooth-line special
 * case.  (The trailing "return line_width;" appears to have been lost in
 * this extract — verify against the full file.)
 */
1961 get_line_width(const struct pipe_rasterizer_state *state)
1963 float line_width = state->line_width;
1965 /* From the OpenGL 4.4 spec:
1967 * "The actual width of non-antialiased lines is determined by rounding
1968 * the supplied width to the nearest integer, then clamping it to the
1969 * implementation-dependent maximum non-antialiased line width."
1971 if (!state->multisample && !state->line_smooth)
1972 line_width = roundf(state->line_width);
1974 if (!state->multisample && state->line_smooth && line_width < 1.5f) {
1975 /* For 1 pixel line thickness or less, the general anti-aliasing
1976 * algorithm gives up, and a garbage line is generated. Setting a
1977 * Line Width of 0.0 specifies the rasterization of the "thinnest"
1978 * (one-pixel-wide), non-antialiased lines.
1980 * Lines rendered with zero Line Width are rasterized using the
1981 * "Grid Intersection Quantization" rules as specified by the
1982 * "Zero-Width (Cosmetic) Line Rasterization" section of the docs.
1984 /* hack around this for gfx4/5 fps counters in hud. */
1985 line_width = GFX_VER < 6 ? 1.5f : 0.0f;
/* Build a rasterizer CSO: pre-pack the 3DSTATE_SF / 3DSTATE_RASTER /
 * 3DSTATE_CLIP / 3DSTATE_LINE_STIPPLE hardware commands from the Gallium
 * pipe_rasterizer_state, so that binding the CSO later is just dirty-flag
 * bookkeeping plus a batch copy of the pre-packed DWords.
 * NOTE(review): several #if GFX_VER branches appear elided in this excerpt
 * (e.g. the two sf.LineWidth assignments below come from different
 * preprocessor branches) — verify against the full file.
 */
1991 * The pipe->create_rasterizer_state() driver hook.
1994 crocus_create_rasterizer_state(struct pipe_context *ctx,
1995 const struct pipe_rasterizer_state *state)
1997 struct crocus_rasterizer_state *cso =
1998 malloc(sizeof(struct crocus_rasterizer_state));
/* NOTE(review): malloc result is used unchecked — presumably the driver's
 * OOM policy tolerates this; confirm with surrounding CSO creators. */
/* Remember whether any fill mode rasterizes as points/lines; consulted at
 * draw time for point/line-specific workarounds. */
2000 cso->fill_mode_point_or_line =
2001 state->fill_front == PIPE_POLYGON_MODE_LINE ||
2002 state->fill_front == PIPE_POLYGON_MODE_POINT ||
2003 state->fill_back == PIPE_POLYGON_MODE_LINE ||
2004 state->fill_back == PIPE_POLYGON_MODE_POINT;
/* Clip-plane constants are uploaded contiguously up to the highest
 * enabled plane (log2 of the enable mask + 1). */
2006 if (state->clip_plane_enable != 0)
2007 cso->num_clip_plane_consts = util_logbase2(state->clip_plane_enable) + 1;
2009 cso->num_clip_plane_consts = 0;
2014 float line_width = get_line_width(state);
2016 crocus_pack_command(GENX(3DSTATE_SF), cso->sf, sf) {
2017 sf.StatisticsEnable = true;
2018 sf.AALineDistanceMode = AALINEDISTANCE_TRUE;
2019 sf.LineEndCapAntialiasingRegionWidth =
2020 state->line_smooth ? _10pixels : _05pixels;
2021 sf.LastPixelEnable = state->line_last_pixel;
2023 sf.AntialiasingEnable = state->line_smooth;
/* Cherryview programs wide lines through a dedicated CHVLineWidth field. */
2026 struct crocus_screen *screen = (struct crocus_screen *)ctx->screen;
2027 if (screen->devinfo.platform == INTEL_PLATFORM_CHV)
2028 sf.CHVLineWidth = line_width;
2030 sf.LineWidth = line_width;
2032 sf.LineWidth = line_width;
2034 sf.PointWidthSource = state->point_size_per_vertex ? Vertex : State;
2035 sf.PointWidth = state->point_size;
/* Provoking vertex selection — indices differ between the flat-shade-first
 * convention and the default last-vertex convention. */
2037 if (state->flatshade_first) {
2038 sf.TriangleFanProvokingVertexSelect = 1;
2040 sf.TriangleStripListProvokingVertexSelect = 2;
2041 sf.TriangleFanProvokingVertexSelect = 2;
2042 sf.LineStripListProvokingVertexSelect = 1;
2046 sf.AttributeSwizzleEnable = true;
2047 if (state->sprite_coord_mode == PIPE_SPRITE_COORD_LOWER_LEFT)
2048 sf.PointSpriteTextureCoordinateOrigin = LOWERLEFT;
2050 sf.PointSpriteTextureCoordinateOrigin = UPPERLEFT;
2054 sf.FrontWinding = state->front_ccw ? 1 : 0; // Or the other way...
/* Polygon offset ("depth bias") state, enabled per fill mode. */
2057 sf.GlobalDepthOffsetEnableSolid = state->offset_tri;
2058 sf.GlobalDepthOffsetEnableWireframe = state->offset_line;
2059 sf.GlobalDepthOffsetEnablePoint = state->offset_point;
/* presumably * 2 converts API offset units to the hardware's depth-offset
 * granularity — TODO confirm against the PRM. */
2060 sf.GlobalDepthOffsetConstant = state->offset_units * 2;
2061 sf.GlobalDepthOffsetScale = state->offset_scale;
2062 sf.GlobalDepthOffsetClamp = state->offset_clamp;
2064 sf.FrontFaceFillMode = translate_fill_mode(state->fill_front);
2065 sf.BackFaceFillMode = translate_fill_mode(state->fill_back);
2068 sf.CullMode = translate_cull_mode(state->cull_face);
2069 sf.ScissorRectangleEnable = true;
2071 #if GFX_VERx10 == 75
2072 sf.LineStippleEnable = state->line_stipple_enable;
/* 3DSTATE_RASTER largely mirrors the SF fill/cull/offset state for the
 * generations that split rasterizer control out of SF. */
2079 crocus_pack_command(GENX(3DSTATE_RASTER), cso->raster, rr) {
2080 rr.FrontWinding = state->front_ccw ? CounterClockwise : Clockwise;
2081 rr.CullMode = translate_cull_mode(state->cull_face);
2082 rr.FrontFaceFillMode = translate_fill_mode(state->fill_front);
2083 rr.BackFaceFillMode = translate_fill_mode(state->fill_back);
2084 rr.DXMultisampleRasterizationEnable = state->multisample;
2085 rr.GlobalDepthOffsetEnableSolid = state->offset_tri;
2086 rr.GlobalDepthOffsetEnableWireframe = state->offset_line;
2087 rr.GlobalDepthOffsetEnablePoint = state->offset_point;
2088 rr.GlobalDepthOffsetConstant = state->offset_units * 2;
2089 rr.GlobalDepthOffsetScale = state->offset_scale;
2090 rr.GlobalDepthOffsetClamp = state->offset_clamp;
2091 rr.SmoothPointEnable = state->point_smooth;
2092 rr.AntialiasingEnable = state->line_smooth;
2093 rr.ScissorRectangleEnable = state->scissor;
2094 rr.ViewportZClipTestEnable = (state->depth_clip_near || state->depth_clip_far);
2099 crocus_pack_command(GENX(3DSTATE_CLIP), cso->clip, cl) {
2100 /* cl.NonPerspectiveBarycentricEnable is filled in at draw time from
2101 * the FS program; cl.ForceZeroRTAIndexEnable is filled in from the FB.
2104 cl.EarlyCullEnable = true;
2108 cl.FrontWinding = state->front_ccw ? 1 : 0;
2109 cl.CullMode = translate_cull_mode(state->cull_face);
2111 cl.UserClipDistanceClipTestEnableBitmask = state->clip_plane_enable;
2113 cl.ViewportZClipTestEnable = (state->depth_clip_near || state->depth_clip_far);
/* D3D-style [0,1] vs OpenGL-style [-1,1] clip-space Z convention. */
2115 cl.APIMode = state->clip_halfz ? APIMODE_D3D : APIMODE_OGL;
2116 cl.GuardbandClipTestEnable = true;
2117 cl.ClipEnable = true;
2118 cl.MinimumPointWidth = 0.125;
2119 cl.MaximumPointWidth = 255.875;
2122 cl.ForceUserClipDistanceClipTestEnableBitmask = true;
2125 if (state->flatshade_first) {
2126 cl.TriangleFanProvokingVertexSelect = 1;
2128 cl.TriangleStripListProvokingVertexSelect = 2;
2129 cl.TriangleFanProvokingVertexSelect = 2;
2130 cl.LineStripListProvokingVertexSelect = 1;
2135 /* Remap from 0..255 back to 1..256 */
2136 const unsigned line_stipple_factor = state->line_stipple_factor + 1;
2138 crocus_pack_command(GENX(3DSTATE_LINE_STIPPLE), cso->line_stipple, line) {
2139 if (state->line_stipple_enable) {
2140 line.LineStipplePattern = state->line_stipple_pattern;
2141 line.LineStippleInverseRepeatCount = 1.0f / line_stipple_factor;
2142 line.LineStippleRepeatCount = line_stipple_factor;
2150 * The pipe->bind_rasterizer_state() driver hook.
2152 * Bind a rasterizer CSO and flag related dirty bits.
/* Diffs the incoming CSO against the currently bound one (cso_changed /
 * cso_changed_memcmp compare old_cso vs new_cso fields) so that only the
 * state atoms actually affected get re-emitted on the next draw.
 */
2155 crocus_bind_rasterizer_state(struct pipe_context *ctx, void *state)
2157 struct crocus_context *ice = (struct crocus_context *) ctx;
2158 struct crocus_rasterizer_state *old_cso = ice->state.cso_rast;
2159 struct crocus_rasterizer_state *new_cso = state;
2162 /* Try to avoid re-emitting 3DSTATE_LINE_STIPPLE, it's non-pipelined */
2163 if (cso_changed_memcmp(line_stipple))
2164 ice->state.dirty |= CROCUS_DIRTY_LINE_STIPPLE;
2166 if (cso_changed(cso.half_pixel_center))
2167 ice->state.dirty |= CROCUS_DIRTY_GEN6_MULTISAMPLE;
2168 if (cso_changed(cso.scissor))
2169 ice->state.dirty |= CROCUS_DIRTY_GEN6_SCISSOR_RECT;
2170 if (cso_changed(cso.multisample))
2171 ice->state.dirty |= CROCUS_DIRTY_WM;
/* NOTE(review): scissor is checked twice (SCISSOR_RECT above, viewport
 * here) — likely two gen-specific branches with the #if lines elided in
 * this excerpt; confirm against the full file. */
2173 if (cso_changed(cso.scissor))
2174 ice->state.dirty |= CROCUS_DIRTY_SF_CL_VIEWPORT;
2177 if (cso_changed(cso.line_stipple_enable) || cso_changed(cso.poly_stipple_enable))
2178 ice->state.dirty |= CROCUS_DIRTY_WM;
2181 if (cso_changed(cso.rasterizer_discard))
2182 ice->state.dirty |= CROCUS_DIRTY_STREAMOUT | CROCUS_DIRTY_CLIP;
2184 if (cso_changed(cso.flatshade_first))
2185 ice->state.dirty |= CROCUS_DIRTY_STREAMOUT;
2188 if (cso_changed(cso.depth_clip_near) || cso_changed(cso.depth_clip_far) ||
2189 cso_changed(cso.clip_halfz))
2190 ice->state.dirty |= CROCUS_DIRTY_CC_VIEWPORT;
2193 if (cso_changed(cso.sprite_coord_enable) ||
2194 cso_changed(cso.sprite_coord_mode) ||
2195 cso_changed(cso.light_twoside))
2196 ice->state.dirty |= CROCUS_DIRTY_GEN7_SBE;
2199 if (cso_changed(cso.clip_plane_enable))
2200 ice->state.dirty |= CROCUS_DIRTY_GEN4_CURBE;
/* Install the new CSO and flag the atoms that always depend on it. */
2204 ice->state.cso_rast = new_cso;
2205 ice->state.dirty |= CROCUS_DIRTY_RASTER;
2206 ice->state.dirty |= CROCUS_DIRTY_CLIP;
2208 ice->state.dirty |= CROCUS_DIRTY_GEN4_CLIP_PROG | CROCUS_DIRTY_GEN4_SF_PROG;
2209 ice->state.dirty |= CROCUS_DIRTY_WM;
2212 ice->state.dirty |= CROCUS_DIRTY_GEN4_FF_GS_PROG;
/* Also flag any shader stages whose compile keys depend on rasterizer
 * state (non-orthogonal state, "NOS"). */
2214 ice->state.stage_dirty |= ice->state.stage_dirty_for_nos[CROCUS_NOS_RASTERIZER];
2218 * Return true if the given wrap mode requires the border color to exist.
2220 * (We can skip uploading it if the sampler isn't going to use it.)
2223 wrap_mode_needs_border_color(unsigned wrap_mode)
/* NOTE(review): the two returns below are alternatives from different
 * preprocessor branches (the #if/#else lines are elided in this excerpt);
 * presumably TCM_HALF_BORDER only exists on newer generations — confirm. */
2226 return wrap_mode == TCM_CLAMP_BORDER || wrap_mode == TCM_HALF_BORDER;
2228 return wrap_mode == TCM_CLAMP_BORDER;
2233 * Gallium CSO for sampler state.
/* CPU-side copy of the sampler state plus precomputed fields; the actual
 * hardware SAMPLER_STATE is packed at upload time (see
 * crocus_upload_sampler_state), since all bound samplers must live in one
 * contiguous table. Additional fields (wrap modes, min_lod) appear elided
 * from this excerpt.
 */
2235 struct crocus_sampler_state {
2236 struct pipe_sampler_state pstate;
2237 union pipe_color_union border_color;
2238 bool needs_border_color;
/* Possibly-overridden magnification filter (see the mip-filter-NONE
 * workaround in crocus_create_sampler_state). */
2242 unsigned mag_img_filter;
2247 * The pipe->create_sampler_state() driver hook.
2249 * We fill out SAMPLER_STATE (except for the border color pointer), and
2250 * store that on the CPU. It doesn't make sense to upload it to a GPU
2251 * buffer object yet, because 3DSTATE_SAMPLER_STATE_POINTERS requires
2252 * all bound sampler states to be in contiguous memory.
2255 crocus_create_sampler_state(struct pipe_context *ctx,
2256 const struct pipe_sampler_state *state)
2258 struct crocus_sampler_state *cso = CALLOC_STRUCT(crocus_sampler_state);
/* Gallium filter enums are defined to match the hardware MAPFILTER
 * values, so they can be used directly. */
2263 STATIC_ASSERT(PIPE_TEX_FILTER_NEAREST == MAPFILTER_NEAREST);
2264 STATIC_ASSERT(PIPE_TEX_FILTER_LINEAR == MAPFILTER_LINEAR);
/* Wrap-mode translation depends on whether nearest filtering is in use
 * (translate_wrap takes that as a flag). */
2266 bool either_nearest = state->min_img_filter == PIPE_TEX_FILTER_NEAREST ||
2267 state->mag_img_filter == PIPE_TEX_FILTER_NEAREST;
2268 cso->wrap_s = translate_wrap(state->wrap_s, either_nearest);
2269 cso->wrap_t = translate_wrap(state->wrap_t, either_nearest);
2270 cso->wrap_r = translate_wrap(state->wrap_r, either_nearest);
2272 cso->pstate = *state;
2274 memcpy(&cso->border_color, &state->border_color, sizeof(cso->border_color));
/* Only upload a border color if some wrap mode can actually sample it. */
2276 cso->needs_border_color = wrap_mode_needs_border_color(cso->wrap_s) ||
2277 wrap_mode_needs_border_color(cso->wrap_t) ||
2278 wrap_mode_needs_border_color(cso->wrap_r);
2280 cso->min_lod = state->min_lod;
2281 cso->mag_img_filter = state->mag_img_filter;
2283 // XXX: explain this code ported from ilo...I don't get it at all...
/* With mipmapping disabled, a positive min_lod is forced to 0 and the
 * mag filter is forced to match the min filter. Ported from the ilo
 * driver; exact hardware rationale unconfirmed (see XXX above). */
2284 if (state->min_mip_filter == PIPE_TEX_MIPFILTER_NONE &&
2285 state->min_lod > 0.0f) {
2286 cso->min_lod = 0.0f;
2287 cso->mag_img_filter = state->min_img_filter;
2294 * The pipe->bind_sampler_states() driver hook.
/* Store the incoming sampler CSO pointers into the per-stage shader state
 * and flag the relevant dirty bits; actual SAMPLER_STATE upload happens
 * later in crocus_upload_sampler_states().
 */
2297 crocus_bind_sampler_states(struct pipe_context *ctx,
2298 enum pipe_shader_type p_stage,
2299 unsigned start, unsigned count,
2302 struct crocus_context *ice = (struct crocus_context *) ctx;
2303 gl_shader_stage stage = stage_from_pipe(p_stage);
2304 struct crocus_shader_state *shs = &ice->state.shaders[stage];
2306 assert(start + count <= CROCUS_MAX_TEXTURE_SAMPLERS);
/* Only record samplers that actually changed (elided lines presumably
 * track whether anything was dirtied — confirm in the full file). */
2310 for (int i = 0; i < count; i++) {
2311 if (shs->samplers[start + i] != states[i]) {
2312 shs->samplers[start + i] = states[i];
/* Sampler changes affect gen-specific program state for FS/VS. */
2319 if (p_stage == PIPE_SHADER_FRAGMENT)
2320 ice->state.dirty |= CROCUS_DIRTY_WM;
2321 else if (p_stage == PIPE_SHADER_VERTEX)
2322 ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_VS;
2324 ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_SAMPLER_STATES_VS << stage;
2325 ice->state.stage_dirty |= ice->state.stage_dirty_for_nos[CROCUS_NOS_TEXTURES];
/* Sampler wrap-mode workaround selector applied at SAMPLER_STATE upload
 * time (see crocus_upload_sampler_states: 1D textures and cube maps need
 * their wrap modes overridden). Enumerators are elided from this excerpt;
 * SAMP_NORMAL, SAMP_CUBE_CLAMP and SAMP_CUBE_CUBE are used below. */
2329 enum samp_workaround {
/* Pack one hardware SAMPLER_STATE into `map` from the CPU-side CSO,
 * applying the requested wrap-mode workaround and pointing at the
 * previously-uploaded border color.
 */
2337 crocus_upload_sampler_state(struct crocus_batch *batch,
2338 struct crocus_sampler_state *cso,
2339 uint32_t border_color_offset,
2340 enum samp_workaround samp_workaround,
2341 uint32_t first_level,
2344 struct pipe_sampler_state *state = &cso->pstate;
2345 uint32_t wrap_s, wrap_t, wrap_r;
2347 wrap_s = cso->wrap_s;
2348 wrap_t = cso->wrap_t;
2349 wrap_r = cso->wrap_r;
/* Override wrap modes for the texture-target workarounds selected by the
 * caller (case bodies elided in this excerpt). */
2351 switch (samp_workaround) {
2352 case SAMP_CUBE_CLAMP:
2357 case SAMP_CUBE_CUBE:
2369 _crocus_pack_state(batch, GENX(SAMPLER_STATE), map, samp) {
2370 samp.TCXAddressControlMode = wrap_s;
2371 samp.TCYAddressControlMode = wrap_t;
2372 samp.TCZAddressControlMode = wrap_r;
2375 samp.NonnormalizedCoordinateEnable = state->unnormalized_coords;
2377 samp.MinModeFilter = state->min_img_filter;
2378 samp.MagModeFilter = cso->mag_img_filter;
2379 samp.MipModeFilter = translate_mip_filter(state->min_mip_filter);
2380 samp.MaximumAnisotropy = RATIO21;
/* Anisotropic filtering: hardware ratio N:1 is encoded as (N-2)/2. */
2382 if (state->max_anisotropy >= 2) {
2383 if (state->min_img_filter == PIPE_TEX_FILTER_LINEAR) {
2384 samp.MinModeFilter = MAPFILTER_ANISOTROPIC;
2386 samp.AnisotropicAlgorithm = EWAApproximation;
2390 if (state->mag_img_filter == PIPE_TEX_FILTER_LINEAR)
2391 samp.MagModeFilter = MAPFILTER_ANISOTROPIC;
2393 samp.MaximumAnisotropy =
2394 MIN2((state->max_anisotropy - 2) / 2, RATIO161);
2397 /* Set address rounding bits if not using nearest filtering. */
2398 if (state->min_img_filter != PIPE_TEX_FILTER_NEAREST) {
2399 samp.UAddressMinFilterRoundingEnable = true;
2400 samp.VAddressMinFilterRoundingEnable = true;
2401 samp.RAddressMinFilterRoundingEnable = true;
2404 if (state->mag_img_filter != PIPE_TEX_FILTER_NEAREST) {
2405 samp.UAddressMagFilterRoundingEnable = true;
2406 samp.VAddressMagFilterRoundingEnable = true;
2407 samp.RAddressMagFilterRoundingEnable = true;
2410 if (state->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE)
2411 samp.ShadowFunction = translate_shadow_func(state->compare_func);
/* LOD clamp range supported by the hardware differs per generation. */
2413 const float hw_max_lod = GFX_VER >= 7 ? 14 : 13;
2416 samp.LODPreClampMode = CLAMP_MODE_OGL;
2418 samp.LODPreClampEnable = true;
2420 samp.MinLOD = CLAMP(cso->min_lod, 0, hw_max_lod);
2421 samp.MaxLOD = CLAMP(state->max_lod, 0, hw_max_lod);
2422 samp.TextureLODBias = CLAMP(state->lod_bias, -16, 15);
2425 samp.BaseMipLevel = CLAMP(first_level, 0, hw_max_lod);
2426 samp.MinandMagStateNotEqual = samp.MinModeFilter != samp.MagModeFilter;
/* Border color pointer is relocation-based or a raw offset depending on
 * generation (the two assignments below are alternative #if branches
 * elided in this excerpt). */
2430 samp.BorderColorPointer =
2431 ro_bo(batch->state.bo, border_color_offset);
2433 samp.BorderColorPointer = border_color_offset;
/* Upload a SAMPLER_BORDER_COLOR_STATE for this sampler/view pair into the
 * batch's state stream, returning its offset via *bc_offset. The layout
 * and encoding of the border color is highly generation-dependent, hence
 * the large #if GFX_VER cascade below (several branch/#endif lines are
 * elided in this excerpt).
 */
2439 crocus_upload_border_color(struct crocus_batch *batch,
2440 struct crocus_sampler_state *cso,
2441 struct crocus_sampler_view *tex,
2442 uint32_t *bc_offset)
2444 /* We may need to swizzle the border color for format faking.
2445 * A/LA formats are faked as R/RG with 000R or R00G swizzles.
2446 * This means we need to move the border color's A channel into
2447 * the R or G channels so that those read swizzles will move it
2450 enum pipe_format internal_format = PIPE_FORMAT_NONE;
2451 union pipe_color_union *color = &cso->border_color;
2452 union pipe_color_union tmp;
2454 internal_format = tex->res->internal_format;
2456 if (util_format_is_alpha(internal_format)) {
2457 unsigned char swz[4] = {
2458 PIPE_SWIZZLE_0, PIPE_SWIZZLE_0,
2459 PIPE_SWIZZLE_0, PIPE_SWIZZLE_W,
2461 util_format_apply_color_swizzle(&tmp, color, swz, true);
2463 } else if (util_format_is_luminance_alpha(internal_format) &&
2464 internal_format != PIPE_FORMAT_L8A8_SRGB) {
2465 unsigned char swz[4] = {
2466 PIPE_SWIZZLE_X, PIPE_SWIZZLE_X,
2467 PIPE_SWIZZLE_X, PIPE_SWIZZLE_W
2469 util_format_apply_color_swizzle(&tmp, color, swz, true);
/* Alignment of the border color state is generation- and format-specific
 * (Haswell integer formats require 512-byte alignment). */
2473 bool is_integer_format = util_format_is_pure_integer(internal_format);
2474 unsigned sbc_size = GENX(SAMPLER_BORDER_COLOR_STATE_length) * 4;
2475 const int sbc_align = (GFX_VER == 8 ? 64 : ((GFX_VERx10 == 75 && is_integer_format) ? 512 : 32));
2476 uint32_t *sbc = stream_state(batch, sbc_size, sbc_align, bc_offset);
2478 struct GENX(SAMPLER_BORDER_COLOR_STATE) state = { 0 };
/* Helper macros assigning one channel at the required bit width; bodies
 * of ASSIGN and parts of the others are elided in this excerpt. */
2480 #define ASSIGN(dst, src) \
2485 #define ASSIGNu16(dst, src) \
2487 dst = (uint16_t)src; \
2490 #define ASSIGNu8(dst, src) \
2492 dst = (uint8_t)src; \
2495 #define BORDER_COLOR_ATTR(macro, _color_type, src) \
2496 macro(state.BorderColor ## _color_type ## Red, src[0]); \
2497 macro(state.BorderColor ## _color_type ## Green, src[1]); \
2498 macro(state.BorderColor ## _color_type ## Blue, src[2]); \
2499 macro(state.BorderColor ## _color_type ## Alpha, src[3]);
2502 /* On Broadwell, the border color is represented as four 32-bit floats,
2503 * integers, or unsigned values, interpreted according to the surface
2504 * format. This matches the sampler->BorderColor union exactly; just
2505 * memcpy the values.
2507 BORDER_COLOR_ATTR(ASSIGN, 32bit, color->ui);
2508 #elif GFX_VERx10 == 75
2509 if (is_integer_format) {
2510 const struct util_format_description *format_desc =
2511 util_format_description(internal_format);
2513 /* From the Haswell PRM, "Command Reference: Structures", Page 36:
2514 * "If any color channel is missing from the surface format,
2515 * corresponding border color should be programmed as zero and if
2516 * alpha channel is missing, corresponding Alpha border color should
2517 * be programmed as 1."
2519 unsigned c[4] = { 0, 0, 0, 1 };
2520 for (int i = 0; i < 4; i++) {
2521 if (format_desc->channel[i].size)
2522 c[i] = color->ui[i];
/* Dispatch on the channel bit width of the integer format. */
2525 switch (format_desc->channel[0].size) {
2527 /* Copy RGBA in order. */
2528 BORDER_COLOR_ATTR(ASSIGNu8, 8bit, c);
2531 /* R10G10B10A2_UINT is treated like a 16-bit format. */
2533 BORDER_COLOR_ATTR(ASSIGNu16, 16bit, c);
2536 if (format_desc->channel[1].size && !format_desc->channel[2].size) {
2537 /* Careful inspection of the tables reveals that for RG32 formats,
2538 * the green channel needs to go where blue normally belongs.
2540 state.BorderColor32bitRed = c[0];
2541 state.BorderColor32bitBlue = c[1];
2542 state.BorderColor32bitAlpha = 1;
2544 /* Copy RGBA in order. */
2545 BORDER_COLOR_ATTR(ASSIGN, 32bit, c);
2549 assert(!"Invalid number of bits per channel in integer format.");
2553 BORDER_COLOR_ATTR(ASSIGN, Float, color->f);
2555 #elif GFX_VER == 5 || GFX_VER == 6
/* ILK/SNB store the border color pre-converted to every texel format the
 * sampler might read: unorm8/unorm16/snorm16/half/float. */
2556 BORDER_COLOR_ATTR(UNCLAMPED_FLOAT_TO_UBYTE, Unorm, color->f);
2557 BORDER_COLOR_ATTR(UNCLAMPED_FLOAT_TO_USHORT, Unorm16, color->f);
2558 BORDER_COLOR_ATTR(UNCLAMPED_FLOAT_TO_SHORT, Snorm16, color->f);
2560 #define MESA_FLOAT_TO_HALF(dst, src) \
2561 dst = _mesa_float_to_half(src);
2563 BORDER_COLOR_ATTR(MESA_FLOAT_TO_HALF, Float16, color->f);
2565 #undef MESA_FLOAT_TO_HALF
/* snorm8 is derived from snorm16 by dropping the low byte. */
2567 state.BorderColorSnorm8Red = state.BorderColorSnorm16Red >> 8;
2568 state.BorderColorSnorm8Green = state.BorderColorSnorm16Green >> 8;
2569 state.BorderColorSnorm8Blue = state.BorderColorSnorm16Blue >> 8;
2570 state.BorderColorSnorm8Alpha = state.BorderColorSnorm16Alpha >> 8;
2572 BORDER_COLOR_ATTR(ASSIGN, Float, color->f);
2575 BORDER_COLOR_ATTR(ASSIGN, , color->f);
2577 BORDER_COLOR_ATTR(ASSIGN, Float, color->f);
2581 #undef BORDER_COLOR_ATTR
2583 GENX(SAMPLER_BORDER_COLOR_STATE_pack)(batch, sbc, &state);
2587 * Upload the sampler states into a contiguous area of GPU memory, for
2588 * 3DSTATE_SAMPLER_STATE_POINTERS_*.
2590 * Also fill out the border color state pointers.
2593 crocus_upload_sampler_states(struct crocus_context *ice,
2594 struct crocus_batch *batch, gl_shader_stage stage)
2596 struct crocus_shader_state *shs = &ice->state.shaders[stage];
2597 const struct shader_info *info = crocus_get_shader_info(ice, stage);
2599 /* We assume the state tracker will call pipe->bind_sampler_states()
2600 * if the program's number of textures changes.
2602 unsigned count = info ? BITSET_LAST_BIT(info->textures_used) : 0;
2607 /* Assemble the SAMPLER_STATEs into a contiguous table that lives
2608 * in the dynamic state memory zone, so we can point to it via the
2609 * 3DSTATE_SAMPLER_STATE_POINTERS_* commands.
2611 unsigned size = count * 4 * GENX(SAMPLER_STATE_length);
2612 uint32_t *map = stream_state(batch, size, 32, &shs->sampler_offset);
2617 for (int i = 0; i < count; i++) {
2618 struct crocus_sampler_state *state = shs->samplers[i];
2619 struct crocus_sampler_view *tex = shs->textures[i];
/* Unused slots are zero-filled so the hardware never reads garbage. */
2621 if (!state || !tex) {
2622 memset(map, 0, 4 * GENX(SAMPLER_STATE_length));
2624 unsigned border_color_offset = 0;
2625 if (state->needs_border_color) {
2626 crocus_upload_border_color(batch, state, tex, &border_color_offset);
2629 enum samp_workaround wa = SAMP_NORMAL;
2630 /* There's a bug in 1D texture sampling - it actually pays
2631 * attention to the wrap_t value, though it should not.
2632 * Override the wrap_t value here to GL_REPEAT to keep
2633 * any nonexistent border pixels from floating in.
2635 if (tex->base.target == PIPE_TEXTURE_1D)
2637 else if (tex->base.target == PIPE_TEXTURE_CUBE ||
2638 tex->base.target == PIPE_TEXTURE_CUBE_ARRAY) {
2639 /* Cube maps must use the same wrap mode for all three coordinate
2640 * dimensions. Prior to Haswell, only CUBE and CLAMP are valid.
2642 * Ivybridge and Baytrail seem to have problems with CUBE mode and
2643 * integer formats. Fall back to CLAMP for now.
2645 if (state->pstate.seamless_cube_map &&
2646 !(GFX_VERx10 == 70 && util_format_is_pure_integer(tex->base.format)))
2647 wa = SAMP_CUBE_CUBE;
2649 wa = SAMP_CUBE_CLAMP;
/* Buffer textures have no mip levels; everything else bases LOD at the
 * view's first_level. */
2652 uint32_t first_level = 0;
2653 if (tex->base.target != PIPE_BUFFER)
2654 first_level = tex->base.u.tex.first_level;
2656 crocus_upload_sampler_state(batch, state, border_color_offset, wa, first_level, map);
2659 map += GENX(SAMPLER_STATE_length);
2664 * The pipe->create_sampler_view() driver hook.
/* Create a crocus_sampler_view: resolve depth/stencil resource splitting,
 * choose the ISL format/swizzle for the view, and build both the normal
 * isl_view and a secondary gather_view with format workarounds for
 * textureGather on older generations.
 */
2666 static struct pipe_sampler_view *
2667 crocus_create_sampler_view(struct pipe_context *ctx,
2668 struct pipe_resource *tex,
2669 const struct pipe_sampler_view *tmpl)
2671 struct crocus_screen *screen = (struct crocus_screen *)ctx->screen;
2672 const struct intel_device_info *devinfo = &screen->devinfo;
2673 struct crocus_sampler_view *isv = calloc(1, sizeof(struct crocus_sampler_view));
2678 /* initialize base object */
2680 isv->base.context = ctx;
2681 isv->base.texture = NULL;
2682 pipe_reference_init(&isv->base.reference, 1);
2683 pipe_resource_reference(&isv->base.texture, tex);
/* Depth/stencil resources are stored split; pick the sub-resource that
 * matches the view format (depth if present, otherwise stencil). */
2685 if (util_format_is_depth_or_stencil(tmpl->format)) {
2686 struct crocus_resource *zres, *sres;
2687 const struct util_format_description *desc =
2688 util_format_description(tmpl->format);
2690 crocus_get_depth_stencil_resources(devinfo, tex, &zres, &sres);
2692 tex = util_format_has_depth(desc) ? &zres->base.b : &sres->base.b;
/* Gen7 stencil sampling goes through a shadow copy of the stencil buffer. */
2694 if (tex->format == PIPE_FORMAT_S8_UINT)
2695 if (GFX_VER == 7 && sres->shadow)
2696 tex = &sres->shadow->base.b;
2699 isv->res = (struct crocus_resource *) tex;
2701 isl_surf_usage_flags_t usage = ISL_SURF_USAGE_TEXTURE_BIT;
2703 if (isv->base.target == PIPE_TEXTURE_CUBE ||
2704 isv->base.target == PIPE_TEXTURE_CUBE_ARRAY)
2705 usage |= ISL_SURF_USAGE_CUBE_BIT;
2707 const struct crocus_format_info fmt =
2708 crocus_format_for_usage(devinfo, tmpl->format, usage);
/* Combine the format's implicit swizzle with the view's requested one. */
2710 enum pipe_swizzle vswz[4] = { tmpl->swizzle_r, tmpl->swizzle_g, tmpl->swizzle_b, tmpl->swizzle_a };
2711 crocus_combine_swizzle(isv->swizzle, fmt.swizzles, vswz);
2713 /* hardcode stencil swizzles - hw returns 0G01, we want GGGG */
2715 (tmpl->format == PIPE_FORMAT_X32_S8X24_UINT ||
2716 tmpl->format == PIPE_FORMAT_X24S8_UINT)) {
2717 isv->swizzle[0] = tmpl->swizzle_g;
2718 isv->swizzle[1] = tmpl->swizzle_g;
2719 isv->swizzle[2] = tmpl->swizzle_g;
2720 isv->swizzle[3] = tmpl->swizzle_g;
2723 isv->clear_color = isv->res->aux.clear_color;
/* Haswell+ can apply the swizzle in the surface state; older gens handle
 * it in shader code instead (see the #else branch below). */
2725 isv->view = (struct isl_view) {
2727 #if GFX_VERx10 >= 75
2728 .swizzle = (struct isl_swizzle) {
2729 .r = pipe_to_isl_swizzle(isv->swizzle[0], false),
2730 .g = pipe_to_isl_swizzle(isv->swizzle[1], false),
2731 .b = pipe_to_isl_swizzle(isv->swizzle[2], false),
2732 .a = pipe_to_isl_swizzle(isv->swizzle[3], false),
2735 /* swizzling handled in shader code */
2736 .swizzle = ISL_SWIZZLE_IDENTITY,
2741 /* Fill out SURFACE_STATE for this view. */
2742 if (tmpl->target != PIPE_BUFFER) {
2743 isv->view.base_level = tmpl->u.tex.first_level;
2744 isv->view.levels = tmpl->u.tex.last_level - tmpl->u.tex.first_level + 1;
2746 /* Hardware older than skylake ignores this value */
2747 assert(tex->target != PIPE_TEXTURE_3D || !tmpl->u.tex.first_layer);
2749 // XXX: do I need to port f9fd0cf4790cb2a530e75d1a2206dbb9d8af7cb2?
2750 isv->view.base_array_layer = tmpl->u.tex.first_layer;
2751 isv->view.array_len =
2752 tmpl->u.tex.last_layer - tmpl->u.tex.first_layer + 1;
2755 /* just create a second view struct for texture gather just in case */
2756 isv->gather_view = isv->view;
/* Haswell gather4 workaround: RG32 formats need the *_FLOAT_LD variant
 * and a gen-specific swizzle adjustment. */
2759 if (fmt.fmt == ISL_FORMAT_R32G32_FLOAT ||
2760 fmt.fmt == ISL_FORMAT_R32G32_SINT ||
2761 fmt.fmt == ISL_FORMAT_R32G32_UINT) {
2762 isv->gather_view.format = ISL_FORMAT_R32G32_FLOAT_LD;
2763 #if GFX_VERx10 >= 75
2764 isv->gather_view.swizzle = (struct isl_swizzle) {
2765 .r = pipe_to_isl_swizzle(isv->swizzle[0], GFX_VERx10 == 75),
2766 .g = pipe_to_isl_swizzle(isv->swizzle[1], GFX_VERx10 == 75),
2767 .b = pipe_to_isl_swizzle(isv->swizzle[2], GFX_VERx10 == 75),
2768 .a = pipe_to_isl_swizzle(isv->swizzle[3], GFX_VERx10 == 75),
2774 /* Sandybridge's gather4 message is broken for integer formats.
2775 * To work around this, we pretend the surface is UNORM for
2776 * 8 or 16-bit formats, and emit shader instructions to recover
2777 * the real INT/UINT value. For 32-bit formats, we pretend
2778 * the surface is FLOAT, and simply reinterpret the resulting
2782 case ISL_FORMAT_R8_SINT:
2783 case ISL_FORMAT_R8_UINT:
2784 isv->gather_view.format = ISL_FORMAT_R8_UNORM;
2787 case ISL_FORMAT_R16_SINT:
2788 case ISL_FORMAT_R16_UINT:
2789 isv->gather_view.format = ISL_FORMAT_R16_UNORM;
2792 case ISL_FORMAT_R32_SINT:
2793 case ISL_FORMAT_R32_UINT:
2794 isv->gather_view.format = ISL_FORMAT_R32_FLOAT;
2802 /* Fill out SURFACE_STATE for this view. */
2803 if (tmpl->target != PIPE_BUFFER) {
/* Finish any deferred aux-buffer import before the surface is used. */
2804 if (crocus_resource_unfinished_aux_import(isv->res))
2805 crocus_resource_finish_aux_import(&screen->base, isv->res);
/* The pipe_context sampler-view destroy hook: drop the texture reference
 * (remaining teardown, e.g. freeing isv, is elided from this excerpt). */
2813 crocus_sampler_view_destroy(struct pipe_context *ctx,
2814 struct pipe_sampler_view *state)
2816 struct crocus_sampler_view *isv = (void *) state;
2817 pipe_resource_reference(&state->texture, NULL);
2822 * The pipe->create_surface() driver hook.
2824 * In Gallium nomenclature, "surfaces" are a view of a resource that
2825 * can be bound as a render target or depth/stencil buffer.
/* Builds the isl_view(s) for the surface, then handles two special cases:
 * (1) pre-gen-with-tile-offsets hardware that cannot render to a
 * non-tile-aligned sub-image (uses a temporary aligned resource), and
 * (2) rendering to an uncompressed view of a compressed resource.
 */
2827 static struct pipe_surface *
2828 crocus_create_surface(struct pipe_context *ctx,
2829 struct pipe_resource *tex,
2830 const struct pipe_surface *tmpl)
2832 struct crocus_screen *screen = (struct crocus_screen *)ctx->screen;
2833 const struct intel_device_info *devinfo = &screen->devinfo;
2835 isl_surf_usage_flags_t usage = 0;
2837 usage = ISL_SURF_USAGE_STORAGE_BIT;
2838 else if (util_format_is_depth_or_stencil(tmpl->format))
2839 usage = ISL_SURF_USAGE_DEPTH_BIT;
2841 usage = ISL_SURF_USAGE_RENDER_TARGET_BIT;
2843 const struct crocus_format_info fmt =
2844 crocus_format_for_usage(devinfo, tmpl->format, usage);
2846 if ((usage & ISL_SURF_USAGE_RENDER_TARGET_BIT) &&
2847 !isl_format_supports_rendering(devinfo, fmt.fmt)) {
2848 /* Framebuffer validation will reject this invalid case, but it
2849 * hasn't had the opportunity yet. In the meantime, we need to
2850 * avoid hitting ISL asserts about unsupported formats below.
2855 struct crocus_surface *surf = calloc(1, sizeof(struct crocus_surface));
2856 struct pipe_surface *psurf = &surf->base;
2857 struct crocus_resource *res = (struct crocus_resource *) tex;
2862 pipe_reference_init(&psurf->reference, 1);
2863 pipe_resource_reference(&psurf->texture, tex);
2864 psurf->context = ctx;
2865 psurf->format = tmpl->format;
2866 psurf->width = tex->width0;
2867 psurf->height = tex->height0;
2868 psurf->texture = tex;
2869 psurf->u.tex.first_layer = tmpl->u.tex.first_layer;
2870 psurf->u.tex.last_layer = tmpl->u.tex.last_layer;
2871 psurf->u.tex.level = tmpl->u.tex.level;
2873 uint32_t array_len = tmpl->u.tex.last_layer - tmpl->u.tex.first_layer + 1;
2875 struct isl_view *view = &surf->view;
2876 *view = (struct isl_view) {
2878 .base_level = tmpl->u.tex.level,
2880 .base_array_layer = tmpl->u.tex.first_layer,
2881 .array_len = array_len,
2882 .swizzle = ISL_SWIZZLE_IDENTITY,
/* Secondary view used when the surface is read back as a texture. */
2887 struct isl_view *read_view = &surf->read_view;
2888 *read_view = (struct isl_view) {
2890 .base_level = tmpl->u.tex.level,
2892 .base_array_layer = tmpl->u.tex.first_layer,
2893 .array_len = array_len,
2894 .swizzle = ISL_SWIZZLE_IDENTITY,
2895 .usage = ISL_SURF_USAGE_TEXTURE_BIT,
2899 surf->clear_color = res->aux.clear_color;
2901 /* Bail early for depth/stencil - we don't want SURFACE_STATE for them. */
2902 if (res->surf.usage & (ISL_SURF_USAGE_DEPTH_BIT |
2903 ISL_SURF_USAGE_STENCIL_BIT))
2906 if (!isl_format_is_compressed(res->surf.format)) {
2907 if (crocus_resource_unfinished_aux_import(res))
2908 crocus_resource_finish_aux_import(&screen->base, res);
2910 memcpy(&surf->surf, &res->surf, sizeof(surf->surf));
2911 uint64_t temp_offset;
2912 uint32_t temp_x, temp_y;
/* For 3D textures the selected layer is a Z slice, not an array layer. */
2914 isl_surf_get_image_offset_B_tile_sa(&res->surf, tmpl->u.tex.level,
2915 res->base.b.target == PIPE_TEXTURE_3D ? 0 : tmpl->u.tex.first_layer,
2916 res->base.b.target == PIPE_TEXTURE_3D ? tmpl->u.tex.first_layer : 0,
2917 &temp_offset, &temp_x, &temp_y);
2918 if (!devinfo->has_surface_tile_offset &&
2919 (temp_x || temp_y)) {
2920 /* Original gfx4 hardware couldn't draw to a non-tile-aligned
/* Work around by allocating a dedicated, tile-aligned resource sized to
 * the selected mip level and pointing the surface at it instead. */
2924 struct pipe_resource wa_templ = (struct pipe_resource) {
2925 .width0 = u_minify(res->base.b.width0, tmpl->u.tex.level),
2926 .height0 = u_minify(res->base.b.height0, tmpl->u.tex.level),
2929 .format = res->base.b.format,
2930 .target = PIPE_TEXTURE_2D,
2931 .bind = (usage & ISL_SURF_USAGE_DEPTH_BIT ? PIPE_BIND_DEPTH_STENCIL : PIPE_BIND_RENDER_TARGET) | PIPE_BIND_SAMPLER_VIEW,
2933 surf->align_res = screen->base.resource_create(&screen->base, &wa_templ);
2934 view->base_level = 0;
2935 view->base_array_layer = 0;
2936 view->array_len = 1;
2937 struct crocus_resource *align_res = (struct crocus_resource *)surf->align_res;
2938 memcpy(&surf->surf, &align_res->surf, sizeof(surf->surf));
2943 /* The resource has a compressed format, which is not renderable, but we
2944 * have a renderable view format. We must be attempting to upload blocks
2945 * of compressed data via an uncompressed view.
2947 * In this case, we can assume there are no auxiliary buffers, a single
2948 * miplevel, and that the resource is single-sampled. Gallium may try
2949 * and create an uncompressed view with multiple layers, however.
2951 assert(!isl_format_is_compressed(fmt.fmt));
2952 assert(res->surf.samples == 1);
2953 assert(view->levels == 1);
2955 /* TODO: compressed pbo uploads aren't working here */
2958 uint64_t offset_B = 0;
2959 uint32_t tile_x_sa = 0, tile_y_sa = 0;
2961 if (view->base_level > 0) {
2962 /* We can't rely on the hardware's miplevel selection with such
2963 * a substantial lie about the format, so we select a single image
2964 * using the Tile X/Y Offset fields. In this case, we can't handle
2965 * multiple array slices.
2967 * On Broadwell, HALIGN and VALIGN are specified in pixels and are
2968 * hard-coded to align to exactly the block size of the compressed
2969 * texture. This means that, when reinterpreted as a non-compressed
2970 * texture, the tile offsets may be anything and we can't rely on
2973 * Return NULL to force the state tracker to take fallback paths.
2975 // TODO: check if the gen7 check is right, originally gen8
2976 if (view->array_len > 1 || GFX_VER == 7)
2979 const bool is_3d = res->surf.dim == ISL_SURF_DIM_3D;
2980 isl_surf_get_image_surf(&screen->isl_dev, &res->surf,
2982 is_3d ? 0 : view->base_array_layer,
2983 is_3d ? view->base_array_layer : 0,
2985 &offset_B, &tile_x_sa, &tile_y_sa);
2987 /* We use address and tile offsets to access a single level/layer
2988 * as a subimage, so reset level/layer so it doesn't offset again.
2990 view->base_array_layer = 0;
2991 view->base_level = 0;
2993 /* Level 0 doesn't require tile offsets, and the hardware can find
2994 * array slices using QPitch even with the format override, so we
2995 * can allow layers in this case. Copy the original ISL surface.
2997 memcpy(&surf->surf, &res->surf, sizeof(surf->surf));
3000 /* Scale down the image dimensions by the block size. */
3001 const struct isl_format_layout *fmtl =
3002 isl_format_get_layout(res->surf.format);
3003 surf->surf.format = fmt.fmt;
3004 surf->surf.logical_level0_px = isl_surf_get_logical_level0_el(&surf->surf);
3005 surf->surf.phys_level0_sa = isl_surf_get_phys_level0_el(&surf->surf);
3006 tile_x_sa /= fmtl->bw;
3007 tile_y_sa /= fmtl->bh;
3009 psurf->width = surf->surf.logical_level0_px.width;
3010 psurf->height = surf->surf.logical_level0_px.height;
/* Zero a brw_image_param and disable the swizzling shifts (all-ones);
 * used as the base for every image param and for unbound image slots.
 * NOTE(review): only swizzling[0] and [1] are set in this excerpt —
 * confirm whether further elements are initialized in the elided lines. */
3017 fill_default_image_param(struct brw_image_param *param)
3019 memset(param, 0, sizeof(*param));
3020 /* Set the swizzling shifts to all-ones to effectively disable swizzling --
3021 * See emit_address_calculation() in brw_fs_surface_builder.cpp for a more
3022 * detailed explanation of these parameters.
3024 param->swizzling[0] = 0xff;
3025 param->swizzling[1] = 0xff;
/* Fill a brw_image_param for a buffer image: element count is the byte
 * size divided by the format's bytes-per-element; stride is one element. */
3029 fill_buffer_image_param(struct brw_image_param *param,
3030 enum pipe_format pfmt,
3033 const unsigned cpp = util_format_get_blocksize(pfmt);
3035 fill_default_image_param(param);
3036 param->size[0] = size / cpp;
3037 param->stride[0] = cpp;
3043 * The pipe->set_shader_images() driver hook.
/* Record the bound image views for a shader stage, choose a storage-image
 * format (lowered or RAW when typed reads aren't supported), fill in the
 * brw_image_param sysvals, and flag the affected dirty state.
 */
3046 crocus_set_shader_images(struct pipe_context *ctx,
3047 enum pipe_shader_type p_stage,
3048 unsigned start_slot, unsigned count,
3049 unsigned unbind_num_trailing_slots,
3050 const struct pipe_image_view *p_images)
3053 struct crocus_context *ice = (struct crocus_context *) ctx;
3054 struct crocus_screen *screen = (struct crocus_screen *)ctx->screen;
3055 const struct intel_device_info *devinfo = &screen->devinfo;
3056 gl_shader_stage stage = stage_from_pipe(p_stage);
3057 struct crocus_shader_state *shs = &ice->state.shaders[stage];
3058 struct crocus_genx_state *genx = ice->state.genx;
3059 struct brw_image_param *image_params = genx->shaders[stage].image_param;
/* Clear the bound bits for the affected slot range, then re-set them
 * below for slots that receive a valid view. */
3061 shs->bound_image_views &= ~u_bit_consecutive(start_slot, count);
3063 for (unsigned i = 0; i < count; i++) {
3064 struct crocus_image_view *iv = &shs->image[start_slot + i];
3066 if (p_images && p_images[i].resource) {
3067 const struct pipe_image_view *img = &p_images[i];
3068 struct crocus_resource *res = (void *) img->resource;
3070 util_copy_image_view(&iv->base, img);
3072 shs->bound_image_views |= 1 << (start_slot + i);
/* Track usage for later resolve/flush decisions. */
3074 res->bind_history |= PIPE_BIND_SHADER_IMAGE;
3075 res->bind_stages |= 1 << stage;
3077 isl_surf_usage_flags_t usage = ISL_SURF_USAGE_STORAGE_BIT;
3078 struct crocus_format_info fmt =
3079 crocus_format_for_usage(devinfo, img->format, usage);
3081 struct isl_swizzle swiz = pipe_to_isl_swizzles(fmt.swizzles);
3082 if (img->shader_access & PIPE_IMAGE_ACCESS_READ) {
3083 /* On Gen8, try to use typed surfaces reads (which support a
3084 * limited number of formats), and if not possible, fall back
/* NOTE(review): the two branches below (RAW fallback vs. lowered
 * format) come from an elided #if/#else pair — confirm which
 * generations take which path. */
3087 if (!isl_has_matching_typed_storage_image_format(devinfo, fmt.fmt))
3088 fmt.fmt = ISL_FORMAT_RAW;
3090 fmt.fmt = isl_lower_storage_image_format(devinfo, fmt.fmt);
3093 if (res->base.b.target != PIPE_BUFFER) {
3094 struct isl_view view = {
3096 .base_level = img->u.tex.level,
3098 .base_array_layer = img->u.tex.first_layer,
3099 .array_len = img->u.tex.last_layer - img->u.tex.first_layer + 1,
3106 isl_surf_fill_image_param(&screen->isl_dev,
3107 &image_params[start_slot + i],
3110 struct isl_view view = {
/* Buffer images: mark the written range valid and fill params from
 * the buffer size/format. */
3117 util_range_add(&res->base.b, &res->valid_buffer_range, img->u.buf.offset,
3118 img->u.buf.offset + img->u.buf.size);
3119 fill_buffer_image_param(&image_params[start_slot + i],
3120 img->format, img->u.buf.size);
3123 pipe_resource_reference(&iv->base.resource, NULL);
3124 fill_default_image_param(&image_params[start_slot + i]);
3128 ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_BINDINGS_VS << stage;
3130 stage == MESA_SHADER_COMPUTE ? CROCUS_DIRTY_COMPUTE_RESOLVES_AND_FLUSHES
3131 : CROCUS_DIRTY_RENDER_RESOLVES_AND_FLUSHES;
3133 /* Broadwell also needs brw_image_params re-uploaded */
3134 ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_VS << stage;
3135 shs->sysvals_need_upload = true;
3141 * The pipe->set_sampler_views() driver hook.
/* Binds sampler views for a stage, optionally taking ownership of the
 * caller's references, and flags the relevant dirty state. */
3144 crocus_set_sampler_views(struct pipe_context *ctx,
3145 enum pipe_shader_type p_stage,
3146 unsigned start, unsigned count,
3147 unsigned unbind_num_trailing_slots,
3148 bool take_ownership,
3149 struct pipe_sampler_view **views)
3151 struct crocus_context *ice = (struct crocus_context *) ctx;
3152 gl_shader_stage stage = stage_from_pipe(p_stage);
3153 struct crocus_shader_state *shs = &ice->state.shaders[stage];
3155 shs->bound_sampler_views &= ~u_bit_consecutive(start, count);
3157 for (unsigned i = 0; i < count; i++) {
3158 struct pipe_sampler_view *pview = views ? views[i] : NULL;
3160 if (take_ownership) {
/* Drop our old reference, then adopt the caller's reference directly. */
3161 pipe_sampler_view_reference((struct pipe_sampler_view **)
3162 &shs->textures[start + i], NULL);
3163 shs->textures[start + i] = (struct crocus_sampler_view *)pview;
3165 pipe_sampler_view_reference((struct pipe_sampler_view **)
3166 &shs->textures[start + i], pview);
3169 struct crocus_sampler_view *view = (void *) pview;
3171 view->res->bind_history |= PIPE_BIND_SAMPLER_VIEW;
3172 view->res->bind_stages |= 1 << stage;
3174 shs->bound_sampler_views |= 1 << (start + i);
3178 /* first level parameters to crocus_upload_sampler_state is gfx6 only */
3179 ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_SAMPLER_STATES_VS << stage;
3181 ice->state.stage_dirty |= (CROCUS_STAGE_DIRTY_BINDINGS_VS << stage);
3183 stage == MESA_SHADER_COMPUTE ? CROCUS_DIRTY_COMPUTE_RESOLVES_AND_FLUSHES
3184 : CROCUS_DIRTY_RENDER_RESOLVES_AND_FLUSHES;
3185 ice->state.stage_dirty |= ice->state.stage_dirty_for_nos[CROCUS_NOS_TEXTURES];
3189 * The pipe->set_tess_state() driver hook.
/* Stores the default outer[4]/inner[2] tessellation levels; they are fed to
 * the TCS as system values, hence the constants-dirty + sysvals flags. */
3192 crocus_set_tess_state(struct pipe_context *ctx,
3193 const float default_outer_level[4],
3194 const float default_inner_level[2])
3196 struct crocus_context *ice = (struct crocus_context *) ctx;
3197 struct crocus_shader_state *shs = &ice->state.shaders[MESA_SHADER_TESS_CTRL];
3199 memcpy(&ice->state.default_outer_level[0], &default_outer_level[0], 4 * sizeof(float));
3200 memcpy(&ice->state.default_inner_level[0], &default_inner_level[0], 2 * sizeof(float));
3202 ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_TCS;
3203 shs->sysvals_need_upload = true;
/* The pipe->set_patch_vertices() driver hook: records the number of control
 * points per tessellation patch. */
3207 crocus_set_patch_vertices(struct pipe_context *ctx, uint8_t patch_vertices)
3209 struct crocus_context *ice = (struct crocus_context *) ctx;
3211 ice->state.patch_vertices = patch_vertices;
/* Releases a pipe_surface: drops the texture reference plus the optional
 * aligned-copy resource (align_res) some surfaces carry.
 * NOTE(review): the final free() of the surface struct is elided here. */
3215 crocus_surface_destroy(struct pipe_context *ctx, struct pipe_surface *p_surf)
3217 struct crocus_surface *surf = (void *) p_surf;
3218 pipe_resource_reference(&p_surf->texture, NULL);
3220 pipe_resource_reference(&surf->align_res, NULL);
/* The pipe->set_clip_state() driver hook: stores user clip planes.  The
 * planes reach shaders as system values, so constants for every geometry-
 * capable stage (VS/GS/TES) must be re-uploaded. */
3225 crocus_set_clip_state(struct pipe_context *ctx,
3226 const struct pipe_clip_state *state)
3228 struct crocus_context *ice = (struct crocus_context *) ctx;
3229 struct crocus_shader_state *shs = &ice->state.shaders[MESA_SHADER_VERTEX];
3230 struct crocus_shader_state *gshs = &ice->state.shaders[MESA_SHADER_GEOMETRY];
3231 struct crocus_shader_state *tshs = &ice->state.shaders[MESA_SHADER_TESS_EVAL];
3233 memcpy(&ice->state.clip_planes, state, sizeof(*state));
/* Older gens push clip planes through CURBE rather than push constants. */
3236 ice->state.dirty |= CROCUS_DIRTY_GEN4_CURBE;
3238 ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_VS | CROCUS_STAGE_DIRTY_CONSTANTS_GS |
3239 CROCUS_STAGE_DIRTY_CONSTANTS_TES;
3240 shs->sysvals_need_upload = true;
3241 gshs->sysvals_need_upload = true;
3242 tshs->sysvals_need_upload = true;
3246 * The pipe->set_polygon_stipple() driver hook.
/* Stores the 32x32 stipple pattern and marks the stipple state dirty. */
3249 crocus_set_polygon_stipple(struct pipe_context *ctx,
3250 const struct pipe_poly_stipple *state)
3252 struct crocus_context *ice = (struct crocus_context *) ctx;
3253 memcpy(&ice->state.poly_stipple, state, sizeof(*state));
3254 ice->state.dirty |= CROCUS_DIRTY_POLYGON_STIPPLE;
3258 * The pipe->set_sample_mask() driver hook.
3261 crocus_set_sample_mask(struct pipe_context *ctx, unsigned sample_mask)
3263 struct crocus_context *ice = (struct crocus_context *) ctx;
/* Keep only the low 8 bits of the mask (the `& 0xff` below); st/mesa may
 * pass 0xffffffff, meaning "enable all samples".
 * NOTE(review): the original comment claimed "16x MSAA / 16 bits of sample
 * maks" [sic], which contradicts the 0xff mask — confirm the supported
 * sample count and widen the mask to 0xffff if 16x is really supported. */
3268 ice->state.sample_mask = sample_mask & 0xff;
3269 ice->state.dirty |= CROCUS_DIRTY_GEN6_SAMPLE_MASK;
/* Computes the effective scissor rectangle for viewport `idx`: starts from
 * the viewport extent clamped to the framebuffer, then intersects with the
 * user scissor if rasterizer scissoring is enabled.  Result goes to *ss
 * (final store elided in this extract). */
3273 crocus_fill_scissor_rect(struct crocus_context *ice,
3275 struct pipe_scissor_state *ss)
3277 struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
3278 struct pipe_rasterizer_state *cso_state = &ice->state.cso_rast->cso;
3279 const struct pipe_viewport_state *vp = &ice->state.viewports[idx];
/* Viewport bounds: translate +/- |scale| gives the screen-space extent;
 * max is made inclusive by subtracting 1. */
3280 struct pipe_scissor_state scissor = (struct pipe_scissor_state) {
3281 .minx = MAX2(-fabsf(vp->scale[0]) + vp->translate[0], 0),
3282 .maxx = MIN2( fabsf(vp->scale[0]) + vp->translate[0], cso_fb->width) - 1,
3283 .miny = MAX2(-fabsf(vp->scale[1]) + vp->translate[1], 0),
3284 .maxy = MIN2( fabsf(vp->scale[1]) + vp->translate[1], cso_fb->height) - 1,
3286 if (cso_state->scissor) {
3287 struct pipe_scissor_state *s = &ice->state.scissors[idx];
3288 scissor.minx = MAX2(scissor.minx, s->minx);
3289 scissor.miny = MAX2(scissor.miny, s->miny);
3290 scissor.maxx = MIN2(scissor.maxx, s->maxx);
3291 scissor.maxy = MIN2(scissor.maxy, s->maxy);
3297 * The pipe->set_scissor_states() driver hook.
3299 * This corresponds to our SCISSOR_RECT state structures. It's an
3300 * exact match, so we just store them, and memcpy them out later.
3303 crocus_set_scissor_states(struct pipe_context *ctx,
3304 unsigned start_slot,
3305 unsigned num_scissors,
3306 const struct pipe_scissor_state *rects)
3308 struct crocus_context *ice = (struct crocus_context *) ctx;
3310 for (unsigned i = 0; i < num_scissors; i++) {
3311 if (rects[i].minx == rects[i].maxx || rects[i].miny == rects[i].maxy) {
3312 /* If the scissor was out of bounds and got clamped to 0 width/height
3313 * at the bounds, the subtraction of 1 from maximums could produce a
3314 * negative number and thus not clip anything. Instead, just provide
3315 * a min > max scissor inside the bounds, which produces the expected
/* ... "clip everything" result (comment terminator elided). */
3318 ice->state.scissors[start_slot + i] = (struct pipe_scissor_state) {
3319 .minx = 1, .maxx = 0, .miny = 1, .maxy = 0,
/* Normal case: store inclusive max coordinates (hence the -1). */
3322 ice->state.scissors[start_slot + i] = (struct pipe_scissor_state) {
3323 .minx = rects[i].minx, .miny = rects[i].miny,
3324 .maxx = rects[i].maxx - 1, .maxy = rects[i].maxy - 1,
3330 ice->state.dirty |= CROCUS_DIRTY_RASTER; /* SF state */
3332 ice->state.dirty |= CROCUS_DIRTY_GEN6_SCISSOR_RECT;
3334 ice->state.dirty |= CROCUS_DIRTY_SF_CL_VIEWPORT;
3339 * The pipe->set_stencil_ref() driver hook.
3341 * This is added to 3DSTATE_WM_DEPTH_STENCIL dynamically at draw time.
3344 crocus_set_stencil_ref(struct pipe_context *ctx,
3345 const struct pipe_stencil_ref ref)
3347 struct crocus_context *ice = (struct crocus_context *) ctx;
/* pipe_stencil_ref is passed by value; a struct copy is sufficient. */
3348 ice->state.stencil_ref = ref;
3349 ice->state.dirty |= CROCUS_DIRTY_COLOR_CALC_STATE;
/* Returns the viewport edge along `axis` in the direction of `sign`
 * (+1.0 => max edge, -1.0 => min edge), i.e. translate +/- |scale|. */
3354 viewport_extent(const struct pipe_viewport_state *state, int axis, float sign)
3356 return copysignf(state->scale[axis], sign) + state->translate[axis];
3361 * The pipe->set_viewport_states() driver hook.
3363 * This corresponds to our SF_CLIP_VIEWPORT states. We can't calculate
3364 * the guardband yet, as we need the framebuffer dimensions, but we can
3365 * at least fill out the rest.
3368 crocus_set_viewport_states(struct pipe_context *ctx,
3369 unsigned start_slot,
3371 const struct pipe_viewport_state *states)
3373 struct crocus_context *ice = (struct crocus_context *) ctx;
3374 struct crocus_screen *screen = (struct crocus_screen *)ctx->screen;
3376 memcpy(&ice->state.viewports[start_slot], states, sizeof(*states) * count);
3378 /* Fix depth test misrenderings by lowering translated depth range */
/* driconf workaround: scale the z translate by a user-configured factor.
 * NOTE(review): only viewport[start_slot] is adjusted, not all `count`
 * viewports — confirm whether that is intentional. */
3379 if (screen->driconf.lower_depth_range_rate != 1.0f)
3380 ice->state.viewports[start_slot].translate[2] *=
3381 screen->driconf.lower_depth_range_rate;
3383 ice->state.dirty |= CROCUS_DIRTY_SF_CL_VIEWPORT;
3384 ice->state.dirty |= CROCUS_DIRTY_RASTER;
3386 ice->state.dirty |= CROCUS_DIRTY_GEN6_SCISSOR_RECT;
/* If depth clipping is disabled in either direction, the CC viewport
 * (depth range clamp) depends on the viewport state too. */
3389 if (ice->state.cso_rast && (!ice->state.cso_rast->cso.depth_clip_near ||
3390 !ice->state.cso_rast->cso.depth_clip_far))
3391 ice->state.dirty |= CROCUS_DIRTY_CC_VIEWPORT;
3395 * The pipe->set_framebuffer_state() driver hook.
3397 * Sets the current draw FBO, including color render targets, depth,
3398 * and stencil buffers.
3401 crocus_set_framebuffer_state(struct pipe_context *ctx,
3402 const struct pipe_framebuffer_state *state)
3404 struct crocus_context *ice = (struct crocus_context *) ctx;
3405 struct pipe_framebuffer_state *cso = &ice->state.framebuffer;
3406 struct crocus_screen *screen = (struct crocus_screen *)ctx->screen;
3407 const struct intel_device_info *devinfo = &screen->devinfo;
3409 struct isl_device *isl_dev = &screen->isl_dev;
3410 struct crocus_resource *zres;
3411 struct crocus_resource *stencil_res;
3414 unsigned samples = util_framebuffer_get_num_samples(state);
3415 unsigned layers = util_framebuffer_get_num_layers(state);
/* Sample-count change invalidates MSAA-dependent packets. */
3418 if (cso->samples != samples) {
3419 ice->state.dirty |= CROCUS_DIRTY_GEN6_MULTISAMPLE;
3420 ice->state.dirty |= CROCUS_DIRTY_GEN6_SAMPLE_MASK;
3421 ice->state.dirty |= CROCUS_DIRTY_RASTER;
3422 #if GFX_VERx10 == 75
3423 ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_FS;
3428 #if GFX_VER >= 6 && GFX_VER < 8
3429 ice->state.dirty |= CROCUS_DIRTY_GEN6_BLEND_STATE;
/* Switching between layered and non-layered rendering affects clipping. */
3432 if ((cso->layers == 0) != (layers == 0)) {
3433 ice->state.dirty |= CROCUS_DIRTY_CLIP;
3436 if (cso->width != state->width || cso->height != state->height) {
3437 ice->state.dirty |= CROCUS_DIRTY_SF_CL_VIEWPORT;
3438 ice->state.dirty |= CROCUS_DIRTY_RASTER;
3439 ice->state.dirty |= CROCUS_DIRTY_DRAWING_RECTANGLE;
3441 ice->state.dirty |= CROCUS_DIRTY_GEN6_SCISSOR_RECT;
3445 if (cso->zsbuf || state->zsbuf) {
3446 ice->state.dirty |= CROCUS_DIRTY_DEPTH_BUFFER;
3448 /* update SF's depth buffer format */
3449 if (GFX_VER == 7 && cso->zsbuf)
3450 ice->state.dirty |= CROCUS_DIRTY_RASTER;
3453 /* wm thread dispatch enable */
3454 ice->state.dirty |= CROCUS_DIRTY_WM;
3455 util_copy_framebuffer_state(cso, state);
3456 cso->samples = samples;
3457 cso->layers = layers;
/* Re-derive HiZ usage from the (new) depth buffer, if any. */
3460 struct crocus_resource *zres;
3461 struct crocus_resource *stencil_res;
3462 enum isl_aux_usage aux_usage = ISL_AUX_USAGE_NONE;
3463 crocus_get_depth_stencil_resources(devinfo, cso->zsbuf->texture, &zres,
3465 if (zres && crocus_resource_level_has_hiz(zres, cso->zsbuf->u.tex.level)) {
3466 aux_usage = zres->aux.usage;
3468 ice->state.hiz_usage = aux_usage;
3471 /* Render target change */
3472 ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_BINDINGS_FS;
3474 ice->state.dirty |= CROCUS_DIRTY_RENDER_RESOLVES_AND_FLUSHES;
3476 ice->state.stage_dirty |= ice->state.stage_dirty_for_nos[CROCUS_NOS_FRAMEBUFFER];
3480 * The pipe->set_constant_buffer() driver hook.
3482 * This uploads any constant data in user buffers, and references
3483 * any UBO resources containing constant data.
3486 crocus_set_constant_buffer(struct pipe_context *ctx,
3487 enum pipe_shader_type p_stage, unsigned index,
3488 bool take_ownership,
3489 const struct pipe_constant_buffer *input)
3491 struct crocus_context *ice = (struct crocus_context *) ctx;
3492 gl_shader_stage stage = stage_from_pipe(p_stage);
3493 struct crocus_shader_state *shs = &ice->state.shaders[stage];
3494 struct pipe_constant_buffer *cbuf = &shs->constbufs[index];
3496 util_copy_constant_buffer(&shs->constbufs[index], input, take_ownership);
3498 if (input && input->buffer_size && (input->buffer || input->user_buffer)) {
3499 shs->bound_cbufs |= 1u << index;
3501 if (input->user_buffer) {
/* User pointer: copy the data into a GPU-visible upload buffer. */
3503 pipe_resource_reference(&cbuf->buffer, NULL);
3504 u_upload_alloc(ice->ctx.const_uploader, 0, input->buffer_size, 64,
3505 &cbuf->buffer_offset, &cbuf->buffer, (void **) &map);
3507 if (!cbuf->buffer) {
3508 /* Allocation was unsuccessful - just unbind */
3509 crocus_set_constant_buffer(ctx, p_stage, index, false, NULL);
/* ... and return early (elided). */
3514 memcpy(map, input->user_buffer, input->buffer_size);
/* Clamp the bound size to what the BO actually provides. */
3517 MIN2(input->buffer_size,
3518 crocus_resource_bo(cbuf->buffer)->size - cbuf->buffer_offset);
3520 struct crocus_resource *res = (void *) cbuf->buffer;
3521 res->bind_history |= PIPE_BIND_CONSTANT_BUFFER;
3522 res->bind_stages |= 1 << stage;
3524 shs->bound_cbufs &= ~(1u << index);
3527 ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_VS << stage;
/* Uploads the current values of a shader's system values (clip planes,
 * tess levels, image params, workgroup size, ...) into the last constant
 * buffer slot, which the compiler reserved for them. */
3531 upload_sysvals(struct crocus_context *ice,
3532 gl_shader_stage stage)
3534 UNUSED struct crocus_genx_state *genx = ice->state.genx;
3535 struct crocus_shader_state *shs = &ice->state.shaders[stage];
3537 struct crocus_compiled_shader *shader = ice->shaders.prog[stage];
3538 if (!shader || shader->num_system_values == 0)
/* (early return elided) */
3541 assert(shader->num_cbufs > 0);
/* System values live in the highest-numbered cbuf. */
3543 unsigned sysval_cbuf_index = shader->num_cbufs - 1;
3544 struct pipe_constant_buffer *cbuf = &shs->constbufs[sysval_cbuf_index];
3545 unsigned upload_size = shader->num_system_values * sizeof(uint32_t);
3546 uint32_t *map = NULL;
3548 assert(sysval_cbuf_index < PIPE_MAX_CONSTANT_BUFFERS);
3549 u_upload_alloc(ice->ctx.const_uploader, 0, upload_size, 64,
3550 &cbuf->buffer_offset, &cbuf->buffer, (void **) &map);
3552 for (int i = 0; i < shader->num_system_values; i++) {
3553 uint32_t sysval = shader->system_values[i];
/* Decode each sysval ID into the corresponding 32-bit value. */
3556 if (BRW_PARAM_DOMAIN(sysval) == BRW_PARAM_DOMAIN_IMAGE) {
3558 unsigned img = BRW_PARAM_IMAGE_IDX(sysval);
3559 unsigned offset = BRW_PARAM_IMAGE_OFFSET(sysval);
3560 struct brw_image_param *param =
3561 &genx->shaders[stage].image_param[img];
3563 assert(offset < sizeof(struct brw_image_param));
3564 value = ((uint32_t *) param)[offset];
3566 } else if (sysval == BRW_PARAM_BUILTIN_ZERO) {
3568 } else if (BRW_PARAM_BUILTIN_IS_CLIP_PLANE(sysval)) {
3569 int plane = BRW_PARAM_BUILTIN_CLIP_PLANE_IDX(sysval);
3570 int comp = BRW_PARAM_BUILTIN_CLIP_PLANE_COMP(sysval);
3571 value = fui(ice->state.clip_planes.ucp[plane][comp]);
3572 } else if (sysval == BRW_PARAM_BUILTIN_PATCH_VERTICES_IN) {
3573 if (stage == MESA_SHADER_TESS_CTRL) {
3574 value = ice->state.vertices_per_patch;
/* TES: prefer the TCS output vertex count if a TCS is bound. */
3576 assert(stage == MESA_SHADER_TESS_EVAL);
3577 const struct shader_info *tcs_info =
3578 crocus_get_shader_info(ice, MESA_SHADER_TESS_CTRL);
3580 value = tcs_info->tess.tcs_vertices_out;
3582 value = ice->state.vertices_per_patch;
3584 } else if (sysval >= BRW_PARAM_BUILTIN_TESS_LEVEL_OUTER_X &&
3585 sysval <= BRW_PARAM_BUILTIN_TESS_LEVEL_OUTER_W) {
3586 unsigned i = sysval - BRW_PARAM_BUILTIN_TESS_LEVEL_OUTER_X;
3587 value = fui(ice->state.default_outer_level[i]);
3588 } else if (sysval == BRW_PARAM_BUILTIN_TESS_LEVEL_INNER_X) {
3589 value = fui(ice->state.default_inner_level[0]);
3590 } else if (sysval == BRW_PARAM_BUILTIN_TESS_LEVEL_INNER_Y) {
3591 value = fui(ice->state.default_inner_level[1]);
3592 } else if (sysval >= BRW_PARAM_BUILTIN_WORK_GROUP_SIZE_X &&
3593 sysval <= BRW_PARAM_BUILTIN_WORK_GROUP_SIZE_Z) {
3594 unsigned i = sysval - BRW_PARAM_BUILTIN_WORK_GROUP_SIZE_X;
3595 value = ice->state.last_block[i];
3597 assert(!"unhandled system value");
3603 cbuf->buffer_size = upload_size;
3604 shs->sysvals_need_upload = false;
3608 * The pipe->set_shader_buffers() driver hook.
3610 * This binds SSBOs and ABOs. Unfortunately, we need to stream out
3611 * SURFACE_STATE here, as the buffer offset may change each time.
3614 crocus_set_shader_buffers(struct pipe_context *ctx,
3615 enum pipe_shader_type p_stage,
3616 unsigned start_slot, unsigned count,
3617 const struct pipe_shader_buffer *buffers,
3618 unsigned writable_bitmask)
3620 struct crocus_context *ice = (struct crocus_context *) ctx;
3621 gl_shader_stage stage = stage_from_pipe(p_stage);
3622 struct crocus_shader_state *shs = &ice->state.shaders[stage];
3624 unsigned modified_bits = u_bit_consecutive(start_slot, count);
3626 shs->bound_ssbos &= ~modified_bits;
3627 shs->writable_ssbos &= ~modified_bits;
3628 shs->writable_ssbos |= writable_bitmask << start_slot;
3630 for (unsigned i = 0; i < count; i++) {
3631 if (buffers && buffers[i].buffer) {
3632 struct crocus_resource *res = (void *) buffers[i].buffer;
3633 struct pipe_shader_buffer *ssbo = &shs->ssbo[start_slot + i];
3634 pipe_resource_reference(&ssbo->buffer, &res->base.b);
3635 ssbo->buffer_offset = buffers[i].buffer_offset;
/* Clamp the binding so it never extends past the end of the BO. */
3637 MIN2(buffers[i].buffer_size, res->bo->size - ssbo->buffer_offset);
3639 shs->bound_ssbos |= 1 << (start_slot + i);
3641 res->bind_history |= PIPE_BIND_SHADER_BUFFER;
3642 res->bind_stages |= 1 << stage;
3644 util_range_add(&res->base.b, &res->valid_buffer_range, ssbo->buffer_offset,
3645 ssbo->buffer_offset + ssbo->buffer_size);
3647 pipe_resource_reference(&shs->ssbo[start_slot + i].buffer, NULL);
3651 ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_BINDINGS_VS << stage;
/* Generic CSO destructor shared by several delete_*_state hooks; the body
 * (a plain free of the CSO) is elided in this extract. */
3655 crocus_delete_state(struct pipe_context *ctx, void *state)
3661 * The pipe->set_vertex_buffers() driver hook.
3663 * This translates pipe_vertex_buffer to our 3DSTATE_VERTEX_BUFFERS packet.
3666 crocus_set_vertex_buffers(struct pipe_context *ctx,
3667 unsigned start_slot, unsigned count,
3668 unsigned unbind_num_trailing_slots,
3669 bool take_ownership,
3670 const struct pipe_vertex_buffer *buffers)
3672 struct crocus_context *ice = (struct crocus_context *) ctx;
3673 struct crocus_screen *screen = (struct crocus_screen *) ctx->screen;
/* Pre-Haswell (except Bay Trail) needs 2 bytes of padding past the end of
 * each vertex buffer — presumably a hardware overfetch workaround; confirm
 * against the PRM. */
3674 const unsigned padding =
3675 (GFX_VERx10 < 75 && screen->devinfo.platform != INTEL_PLATFORM_BYT) * 2;
3676 ice->state.bound_vertex_buffers &=
3677 ~u_bit_consecutive64(start_slot, count + unbind_num_trailing_slots);
3679 util_set_vertex_buffers_mask(ice->state.vertex_buffers, &ice->state.bound_vertex_buffers,
3680 buffers, start_slot, count, unbind_num_trailing_slots,
3683 for (unsigned i = 0; i < count; i++) {
3684 struct pipe_vertex_buffer *state =
3685 &ice->state.vertex_buffers[start_slot + i];
3687 if (!state->is_user_buffer && state->buffer.resource) {
3688 struct crocus_resource *res = (void *)state->buffer.resource;
3689 res->bind_history |= PIPE_BIND_VERTEX_BUFFER;
/* Record the end address (plus padding) for the 3DSTATE_VERTEX_BUFFERS
 * EndAddress field at draw time. */
3693 if (state->buffer.resource)
3694 end = state->buffer.resource->width0 + padding;
3695 ice->state.vb_end[start_slot + i] = end;
3697 ice->state.dirty |= CROCUS_DIRTY_VERTEX_BUFFERS;
3701 static uint8_t get_wa_flags(enum isl_format format)
3703 uint8_t wa_flags = 0;
3706 case ISL_FORMAT_R10G10B10A2_USCALED:
3707 wa_flags = BRW_ATTRIB_WA_SCALE;
3709 case ISL_FORMAT_R10G10B10A2_SSCALED:
3710 wa_flags = BRW_ATTRIB_WA_SIGN | BRW_ATTRIB_WA_SCALE;
3712 case ISL_FORMAT_R10G10B10A2_UNORM:
3713 wa_flags = BRW_ATTRIB_WA_NORMALIZE;
3715 case ISL_FORMAT_R10G10B10A2_SNORM:
3716 wa_flags = BRW_ATTRIB_WA_SIGN | BRW_ATTRIB_WA_NORMALIZE;
3718 case ISL_FORMAT_R10G10B10A2_SINT:
3719 wa_flags = BRW_ATTRIB_WA_SIGN;
3721 case ISL_FORMAT_B10G10R10A2_USCALED:
3722 wa_flags = BRW_ATTRIB_WA_SCALE | BRW_ATTRIB_WA_BGRA;
3724 case ISL_FORMAT_B10G10R10A2_SSCALED:
3725 wa_flags = BRW_ATTRIB_WA_SIGN | BRW_ATTRIB_WA_SCALE | BRW_ATTRIB_WA_BGRA;
3727 case ISL_FORMAT_B10G10R10A2_UNORM:
3728 wa_flags = BRW_ATTRIB_WA_NORMALIZE | BRW_ATTRIB_WA_BGRA;
3730 case ISL_FORMAT_B10G10R10A2_SNORM:
3731 wa_flags = BRW_ATTRIB_WA_SIGN | BRW_ATTRIB_WA_NORMALIZE | BRW_ATTRIB_WA_BGRA;
3733 case ISL_FORMAT_B10G10R10A2_SINT:
3734 wa_flags = BRW_ATTRIB_WA_SIGN | BRW_ATTRIB_WA_BGRA;
3736 case ISL_FORMAT_B10G10R10A2_UINT:
3737 wa_flags = BRW_ATTRIB_WA_BGRA;
3747 * Gallium CSO for vertex elements.
3749 struct crocus_vertex_element_state {
/* Packed 3DSTATE_VERTEX_ELEMENTS: 1 header DWord + up to 33 elements. */
3750 uint32_t vertex_elements[1 + 33 * GENX(VERTEX_ELEMENT_STATE_length)];
/* Packed 3DSTATE_VF_INSTANCING commands, one per element. */
3752 uint32_t vf_instancing[33 * GENX(3DSTATE_VF_INSTANCING_length)];
/* Alternate last VE/VFI used at draw time when the VS needs EdgeFlag. */
3754 uint32_t edgeflag_ve[GENX(VERTEX_ELEMENT_STATE_length)];
3756 uint32_t edgeflag_vfi[GENX(3DSTATE_VF_INSTANCING_length)];
/* Per-vertex-buffer instance divisor, indexed by vertex_buffer_index. */
3758 uint32_t step_rate[16];
/* Per-element BRW_ATTRIB_WA_* flags (see get_wa_flags). */
3759 uint8_t wa_flags[33];
3764 * The pipe->create_vertex_elements() driver hook.
3766 * This translates pipe_vertex_element to our 3DSTATE_VERTEX_ELEMENTS
3767 * and 3DSTATE_VF_INSTANCING commands. The vertex_elements and vf_instancing
3768 * arrays are ready to be emitted at draw time if no EdgeFlag or SGVs are
3769 * needed. In these cases we will need information available at draw time.
3770 * We setup edgeflag_ve and edgeflag_vfi as alternatives last
3771 * 3DSTATE_VERTEX_ELEMENT and 3DSTATE_VF_INSTANCING that can be used at
3772 * draw time if we detect that EdgeFlag is needed by the Vertex Shader.
3775 crocus_create_vertex_elements(struct pipe_context *ctx,
3777 const struct pipe_vertex_element *state)
3779 struct crocus_screen *screen = (struct crocus_screen *)ctx->screen;
3780 const struct intel_device_info *devinfo = &screen->devinfo;
3781 struct crocus_vertex_element_state *cso =
3782 malloc(sizeof(struct crocus_vertex_element_state));
/* Pack the 3DSTATE_VERTEX_ELEMENTS header; DWordLength counts at least
 * one element even when count == 0 (a dummy element is emitted below). */
3786 crocus_pack_command(GENX(3DSTATE_VERTEX_ELEMENTS), cso->vertex_elements, ve) {
3788 1 + GENX(VERTEX_ELEMENT_STATE_length) * MAX2(count, 1) - 2;
3791 uint32_t *ve_pack_dest = &cso->vertex_elements[1];
3793 uint32_t *vfi_pack_dest = cso->vf_instancing;
/* count == 0: emit a dummy element storing (0, 0, 0, 1). */
3797 crocus_pack_state(GENX(VERTEX_ELEMENT_STATE), ve_pack_dest, ve) {
3799 ve.SourceElementFormat = ISL_FORMAT_R32G32B32A32_FLOAT;
3800 ve.Component0Control = VFCOMP_STORE_0;
3801 ve.Component1Control = VFCOMP_STORE_0;
3802 ve.Component2Control = VFCOMP_STORE_0;
3803 ve.Component3Control = VFCOMP_STORE_1_FP;
3806 crocus_pack_command(GENX(3DSTATE_VF_INSTANCING), vfi_pack_dest, vi) {
3811 for (int i = 0; i < count; i++) {
3812 const struct crocus_format_info fmt =
3813 crocus_format_for_usage(devinfo, state[i].src_format, 0);
3814 unsigned comp[4] = { VFCOMP_STORE_SRC, VFCOMP_STORE_SRC,
3815 VFCOMP_STORE_SRC, VFCOMP_STORE_SRC };
3816 enum isl_format actual_fmt = fmt.fmt;
3819 cso->wa_flags[i] = get_wa_flags(fmt.fmt);
/* 2:10:10:10 formats are fetched as R10G10B10A2_UINT and fixed up in
 * the shader via wa_flags; 3-channel int formats get a 4th channel. */
3821 if (fmt.fmt == ISL_FORMAT_R10G10B10A2_USCALED ||
3822 fmt.fmt == ISL_FORMAT_R10G10B10A2_SSCALED ||
3823 fmt.fmt == ISL_FORMAT_R10G10B10A2_UNORM ||
3824 fmt.fmt == ISL_FORMAT_R10G10B10A2_SNORM ||
3825 fmt.fmt == ISL_FORMAT_R10G10B10A2_SINT ||
3826 fmt.fmt == ISL_FORMAT_B10G10R10A2_USCALED ||
3827 fmt.fmt == ISL_FORMAT_B10G10R10A2_SSCALED ||
3828 fmt.fmt == ISL_FORMAT_B10G10R10A2_UNORM ||
3829 fmt.fmt == ISL_FORMAT_B10G10R10A2_SNORM ||
3830 fmt.fmt == ISL_FORMAT_B10G10R10A2_UINT ||
3831 fmt.fmt == ISL_FORMAT_B10G10R10A2_SINT)
3832 actual_fmt = ISL_FORMAT_R10G10B10A2_UINT;
3833 if (fmt.fmt == ISL_FORMAT_R8G8B8_SINT)
3834 actual_fmt = ISL_FORMAT_R8G8B8A8_SINT;
3835 if (fmt.fmt == ISL_FORMAT_R8G8B8_UINT)
3836 actual_fmt = ISL_FORMAT_R8G8B8A8_UINT;
3837 if (fmt.fmt == ISL_FORMAT_R16G16B16_SINT)
3838 actual_fmt = ISL_FORMAT_R16G16B16A16_SINT;
3839 if (fmt.fmt == ISL_FORMAT_R16G16B16_UINT)
3840 actual_fmt = ISL_FORMAT_R16G16B16A16_UINT;
3843 cso->step_rate[state[i].vertex_buffer_index] = state[i].instance_divisor;
/* Missing channels are filled with 0, and the last with 0 or 1. */
3845 switch (isl_format_get_num_channels(fmt.fmt)) {
3846 case 0: comp[0] = VFCOMP_STORE_0; FALLTHROUGH;
3847 case 1: comp[1] = VFCOMP_STORE_0; FALLTHROUGH;
3848 case 2: comp[2] = VFCOMP_STORE_0; FALLTHROUGH;
3850 comp[3] = isl_format_has_int_channel(fmt.fmt) ? VFCOMP_STORE_1_INT
3851 : VFCOMP_STORE_1_FP;
3854 crocus_pack_state(GENX(VERTEX_ELEMENT_STATE), ve_pack_dest, ve) {
3856 ve.EdgeFlagEnable = false;
3858 ve.VertexBufferIndex = state[i].vertex_buffer_index;
3860 ve.SourceElementOffset = state[i].src_offset;
3861 ve.SourceElementFormat = actual_fmt;
3862 ve.Component0Control = comp[0];
3863 ve.Component1Control = comp[1];
3864 ve.Component2Control = comp[2];
3865 ve.Component3Control = comp[3];
3867 ve.DestinationElementOffset = i * 4;
3872 crocus_pack_command(GENX(3DSTATE_VF_INSTANCING), vfi_pack_dest, vi) {
3873 vi.VertexElementIndex = i;
3874 vi.InstancingEnable = state[i].instance_divisor > 0;
3875 vi.InstanceDataStepRate = state[i].instance_divisor;
3878 ve_pack_dest += GENX(VERTEX_ELEMENT_STATE_length);
3880 vfi_pack_dest += GENX(3DSTATE_VF_INSTANCING_length);
3884 /* An alternative version of the last VE and VFI is stored so it
3885 * can be used at draw time in case Vertex Shader uses EdgeFlag
3888 const unsigned edgeflag_index = count - 1;
3889 const struct crocus_format_info fmt =
3890 crocus_format_for_usage(devinfo, state[edgeflag_index].src_format, 0);
3891 crocus_pack_state(GENX(VERTEX_ELEMENT_STATE), cso->edgeflag_ve, ve) {
3893 ve.EdgeFlagEnable = true;
3895 ve.VertexBufferIndex = state[edgeflag_index].vertex_buffer_index;
3897 ve.SourceElementOffset = state[edgeflag_index].src_offset;
3898 ve.SourceElementFormat = fmt.fmt;
3899 ve.Component0Control = VFCOMP_STORE_SRC;
3900 ve.Component1Control = VFCOMP_STORE_0;
3901 ve.Component2Control = VFCOMP_STORE_0;
3902 ve.Component3Control = VFCOMP_STORE_0;
3905 crocus_pack_command(GENX(3DSTATE_VF_INSTANCING), cso->edgeflag_vfi, vi) {
3906 /* The vi.VertexElementIndex of the EdgeFlag Vertex Element is filled
3907 * at draw time, as it should change if SGVs are emitted.
3909 vi.InstancingEnable = state[edgeflag_index].instance_divisor > 0;
3910 vi.InstanceDataStepRate = state[edgeflag_index].instance_divisor;
3919 * The pipe->bind_vertex_elements_state() driver hook.
3922 crocus_bind_vertex_elements_state(struct pipe_context *ctx, void *state)
3924 struct crocus_context *ice = (struct crocus_context *) ctx;
3926 struct crocus_vertex_element_state *old_cso = ice->state.cso_vertex_elements;
3927 struct crocus_vertex_element_state *new_cso = state;
/* Element-count changes affect 3DSTATE_VF_SGVS on gen8. */
3929 if (new_cso && cso_changed(count))
3930 ice->state.dirty |= CROCUS_DIRTY_GEN8_VF_SGVS;
3932 ice->state.cso_vertex_elements = state;
3933 ice->state.dirty |= CROCUS_DIRTY_VERTEX_ELEMENTS | CROCUS_DIRTY_VERTEX_BUFFERS;
3934 ice->state.stage_dirty |= ice->state.stage_dirty_for_nos[CROCUS_NOS_VERTEX_ELEMENTS];
/* Tracks a window [offset_start, offset_end) of prim-count snapshot pairs
 * in the streamout counter buffer; an accumulator field is elided here. */
3938 struct crocus_streamout_counter {
3939 uint32_t offset_start;
3940 uint32_t offset_end;
3946 * Gallium CSO for stream output (transform feedback) targets.
3948 struct crocus_stream_output_target {
3949 struct pipe_stream_output_target base;
3951 /** Stride (bytes-per-vertex) during this transform feedback operation */
3954 /** Has 3DSTATE_SO_BUFFER actually been emitted, zeroing the offsets? */
/* Buffer + offset holding the GPU-written SO write-offset/prim counters. */
3957 struct crocus_resource *offset_res;
3958 uint32_t offset_offset;
/* Primitive counters: prev_count covers already-aggregated ranges,
 * count the currently-recording range. */
3962 struct crocus_streamout_counter prev_count;
3963 struct crocus_streamout_counter count;
3966 /** Does the next 3DSTATE_SO_BUFFER need to zero the offsets? */
/* Reads back the GPU-written streamout write offset (in bytes) and converts
 * it to a vertex count by dividing by the target's stride.
 * NOTE(review): the mapped value is read via `val` but unmapped via
 * `transfer`, whose initialization is elided in this extract — confirm the
 * missing lines wire buffer_map's transfer out-parameter correctly. */
3973 crocus_get_so_offset(struct pipe_stream_output_target *so)
3975 struct crocus_stream_output_target *tgt = (void *)so;
3976 struct pipe_transfer *transfer;
3977 struct pipe_box box;
3979 u_box_1d(tgt->offset_offset, 4, &box);
3980 void *val = so->context->buffer_map(so->context, &tgt->offset_res->base.b,
3981 0, PIPE_MAP_DIRECTLY,
3984 result = *(uint32_t *)val;
3985 so->context->buffer_unmap(so->context, transfer);
3987 return result / tgt->stride;
/* Forward declaration: counter aggregation defined later in this file. */
3993 compute_vertices_written_so_far(struct crocus_context *ice,
3994 struct crocus_stream_output_target *tgt,
3995 struct crocus_streamout_counter *count,
/* Gen7+ variant: derives the vertex count from the aggregated
 * SO_NUM_PRIMS_WRITTEN snapshots instead of reading a write offset. */
3999 crocus_get_so_offset(struct pipe_stream_output_target *so)
4001 struct crocus_stream_output_target *tgt = (void *)so;
4002 struct crocus_context *ice = (void *)so->context;
4004 uint64_t vert_written;
4005 compute_vertices_written_so_far(ice, tgt, &tgt->prev_count, &vert_written);
4006 return vert_written;
4011 * The pipe->create_stream_output_target() driver hook.
4013 * "Target" here refers to a destination buffer. We translate this into
4014 * a 3DSTATE_SO_BUFFER packet. We can handle most fields, but don't yet
4015 * know which buffer this represents, or whether we ought to zero the
4016 * write-offsets, or append. Those are handled in the set() hook.
4018 static struct pipe_stream_output_target *
4019 crocus_create_stream_output_target(struct pipe_context *ctx,
4020 struct pipe_resource *p_res,
4021 unsigned buffer_offset,
4022 unsigned buffer_size)
4024 struct crocus_resource *res = (void *) p_res;
4025 struct crocus_stream_output_target *cso = calloc(1, sizeof(*cso));
/* NOTE(review): a calloc-failure early-return is presumably in the elided
 * lines — confirm. */
4029 res->bind_history |= PIPE_BIND_STREAM_OUTPUT;
4031 pipe_reference_init(&cso->base.reference, 1);
4032 pipe_resource_reference(&cso->base.buffer, p_res);
4033 cso->base.buffer_offset = buffer_offset;
4034 cso->base.buffer_size = buffer_size;
4035 cso->base.context = ctx;
4037 util_range_add(&res->base.b, &res->valid_buffer_range, buffer_offset,
4038 buffer_offset + buffer_size);
4040 struct crocus_context *ice = (struct crocus_context *) ctx;
/* Allocate a small buffer to receive the GPU-written SO offset/counter. */
4042 u_upload_alloc(ice->ctx.stream_uploader, 0, sizeof(uint32_t), 4,
4043 &cso->offset_offset,
4044 (struct pipe_resource **)&cso->offset_res,
/* Destroys a streamout target: releases the counter buffer and the SO
 * destination buffer (final free of the CSO elided in this extract). */
4052 crocus_stream_output_target_destroy(struct pipe_context *ctx,
4053 struct pipe_stream_output_target *state)
4055 struct crocus_stream_output_target *cso = (void *) state;
4057 pipe_resource_reference((struct pipe_resource **)&cso->offset_res, NULL);
4058 pipe_resource_reference(&cso->base.buffer, NULL);
4063 #define GEN6_SO_NUM_PRIMS_WRITTEN 0x2288
4064 #define GEN7_SO_WRITE_OFFSET(n) (0x5280 + (n) * 4)
/* Folds pending GPU-written prim-count snapshot pairs into counter->accum.
 * If the batch still references the counter BO, it is flushed and waited on
 * first so the CPU readback below sees completed values. */
4068 aggregate_stream_counter(struct crocus_batch *batch, struct crocus_stream_output_target *tgt,
4069 struct crocus_streamout_counter *counter)
4071 uint64_t *prim_counts = tgt->prim_map;
4073 if (crocus_batch_references(batch, tgt->offset_res->bo)) {
4074 struct pipe_fence_handle *out_fence = NULL;
4075 batch->ice->ctx.flush(&batch->ice->ctx, &out_fence, 0);
4076 batch->screen->base.fence_finish(&batch->screen->base, &batch->ice->ctx, out_fence, UINT64_MAX);
4077 batch->screen->base.fence_reference(&batch->screen->base, &out_fence, NULL);
/* Snapshots come in (begin, end) pairs; accumulate each delta. */
4080 for (unsigned i = counter->offset_start / sizeof(uint64_t); i < counter->offset_end / sizeof(uint64_t); i += 2) {
4081 counter->accum += prim_counts[i + 1] - prim_counts[i];
4083 tgt->count.offset_start = tgt->count.offset_end = 0;
/* Emits a GPU snapshot of SO_NUM_PRIMS_WRITTEN into the target's counter
 * buffer, (re)allocating the 4096-byte buffer on first use and aggregating
 * old snapshots when the buffer is nearly full. */
4087 crocus_stream_store_prims_written(struct crocus_batch *batch,
4088 struct crocus_stream_output_target *tgt)
4090 if (!tgt->offset_res) {
4091 u_upload_alloc(batch->ice->ctx.stream_uploader, 0, 4096, 4,
4092 &tgt->offset_offset,
4093 (struct pipe_resource **)&tgt->offset_res,
4095 tgt->count.offset_start = tgt->count.offset_end = 0;
/* Near the end of the snapshot buffer: drain pending counts to the CPU
 * accumulators so recording can restart at offset 0. */
4098 if (tgt->count.offset_end + 16 >= 4096) {
4099 aggregate_stream_counter(batch, tgt, &tgt->prev_count);
4100 aggregate_stream_counter(batch, tgt, &tgt->count);
/* Flush so the register read observes all prior primitive output. */
4103 crocus_emit_mi_flush(batch);
4104 crocus_store_register_mem64(batch, GEN6_SO_NUM_PRIMS_WRITTEN,
4105 tgt->offset_res->bo,
4106 tgt->count.offset_end + tgt->offset_offset, false);
4107 tgt->count.offset_end += 8;
/* Computes *svbi, the number of vertices written so far for `tgt`, by
 * aggregating the prim counters and multiplying by the vertices-per-
 * primitive of the last transform feedback draw. */
4111 compute_vertices_written_so_far(struct crocus_context *ice,
4112 struct crocus_stream_output_target *tgt,
4113 struct crocus_streamout_counter *counter,
4116 //TODO vertices per prim
4117 aggregate_stream_counter(&ice->batches[0], tgt, counter);
4119 *svbi = counter->accum * ice->state.last_xfb_verts_per_prim;
4123 * The pipe->set_stream_output_targets() driver hook.
4125 * At this point, we know which targets are bound to a particular index,
4126 * and also whether we want to append or start over. We can finish the
4127 * 3DSTATE_SO_BUFFER packets we started earlier.
/* The pipe->set_stream_output_targets() driver hook: swaps in new SO
 * targets, flushes prior streamout writes, handles append-vs-restart
 * per-buffer offsets, and flags dependent dirty state.
 * NOTE(review): line numbering is non-contiguous; gen-version guards and
 * some braces are not visible in this view. */
4130 crocus_set_stream_output_targets(struct pipe_context *ctx,
4131 unsigned num_targets,
4132 struct pipe_stream_output_target **targets,
4133 const unsigned *offsets)
4135 struct crocus_context *ice = (struct crocus_context *) ctx;
4136 struct crocus_batch *batch = &ice->batches[CROCUS_BATCH_RENDER];
4137 struct pipe_stream_output_target *old_tgt[4] = { NULL, NULL, NULL, NULL };
4138 const bool active = num_targets > 0;
/* Streamout toggled on/off: mark streamout-related state dirty. */
4139 if (ice->state.streamout_active != active) {
4140 ice->state.streamout_active = active;
4142 ice->state.dirty |= CROCUS_DIRTY_STREAMOUT;
4144 ice->state.dirty |= CROCUS_DIRTY_GEN4_FF_GS_PROG;
4147 /* We only emit 3DSTATE_SO_DECL_LIST when streamout is active, because
4148 * it's a non-pipelined command. If we're switching streamout on, we
4149 * may have missed emitting it earlier, so do so now. (We're already
4150 * taking a stall to update 3DSTATE_SO_BUFFERS anyway...)
4154 ice->state.dirty |= CROCUS_DIRTY_SO_DECL_LIST;
/* Make results written through the outgoing targets visible to later
 * reads (history flush per bound resource). */
4158 for (int i = 0; i < PIPE_MAX_SO_BUFFERS; i++) {
4159 struct crocus_stream_output_target *tgt =
4160 (void *) ice->state.so_target[i];
4162 struct crocus_resource *res = (void *) tgt->base.buffer;
4164 flush |= crocus_flush_bits_for_history(res);
4165 crocus_dirty_for_history(ice, res);
4168 crocus_emit_pipe_control_flush(&ice->batches[CROCUS_BATCH_RENDER],
4169 "make streamout results visible", flush);
/* Install the new targets, keeping references to the old ones so their
 * counters can be finalized below. */
4173 ice->state.so_targets = num_targets;
4174 for (int i = 0; i < 4; i++) {
4175 pipe_so_target_reference(&old_tgt[i], ice->state.so_target[i]);
4176 pipe_so_target_reference(&ice->state.so_target[i],
4177 i < num_targets ? targets[i] : NULL);
4181 bool stored_num_prims = false;
4182 for (int i = 0; i < PIPE_MAX_SO_BUFFERS; i++) {
4184 struct crocus_stream_output_target *tgt =
4185 (void *) ice->state.so_target[i];
/* offset == 0 restarts the buffer; otherwise we append, resuming the
 * SVBI from the vertices written so far. */
4189 if (offsets[i] == 0) {
4190 // This means that we're supposed to ignore anything written to
4191 // the buffer before. We can do this by just clearing out the
4192 // count of writes to the prim count buffer.
4193 tgt->count.offset_start = tgt->count.offset_end;
4194 tgt->count.accum = 0;
4195 ice->state.svbi = 0;
4197 if (tgt->offset_res) {
4198 compute_vertices_written_so_far(ice, tgt, &tgt->count, &ice->state.svbi);
4199 tgt->count.offset_start = tgt->count.offset_end;
/* Snapshot prims-written once per call for the newly-bound target. */
4203 if (!stored_num_prims) {
4204 crocus_stream_store_prims_written(batch, tgt);
4205 stored_num_prims = true;
/* For outgoing targets: snapshot the final count and save it as
 * prev_count so a later re-bind can resume. */
4208 struct crocus_stream_output_target *tgt =
4209 (void *) old_tgt[i];
4211 if (!stored_num_prims) {
4212 crocus_stream_store_prims_written(batch, tgt);
4213 stored_num_prims = true;
4216 if (tgt->offset_res) {
4217 tgt->prev_count = tgt->count;
4221 pipe_so_target_reference(&old_tgt[i], NULL);
4223 ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_BINDINGS_GS;
/* SO_WRITE_OFFSET register handling (GEN7_* registers; presumably under
 * a GFX_VER guard not visible here): zero on restart, reload the saved
 * offset on append, and save the old target's offset on unbind. */
4225 for (int i = 0; i < PIPE_MAX_SO_BUFFERS; i++) {
4227 struct crocus_stream_output_target *tgt =
4228 (void *) ice->state.so_target[i];
4230 if (offsets[i] == 0) {
4233 tgt->zero_offset = true;
4235 crocus_load_register_imm32(batch, GEN7_SO_WRITE_OFFSET(i), 0);
4238 crocus_load_register_mem32(batch, GEN7_SO_WRITE_OFFSET(i),
4239 tgt->offset_res->bo,
4240 tgt->offset_offset);
4242 struct crocus_stream_output_target *tgt =
4243 (void *) old_tgt[i];
4245 crocus_store_register_mem32(batch, GEN7_SO_WRITE_OFFSET(i),
4246 tgt->offset_res->bo,
4247 tgt->offset_offset, false);
4249 pipe_so_target_reference(&old_tgt[i], NULL);
4252 /* No need to update 3DSTATE_SO_BUFFER unless SOL is active. */
4256 ice->state.dirty |= CROCUS_DIRTY_GEN7_SO_BUFFERS;
4258 ice->state.dirty |= CROCUS_DIRTY_GEN6_SVBI;
4266 * A crocus-vtable helper for encoding the 3DSTATE_SO_DECL_LIST and
4267 * 3DSTATE_STREAMOUT packets.
4269 * 3DSTATE_SO_DECL_LIST is a list of shader outputs we want the streamout
4270 * hardware to record. We can create it entirely based on the shader, with
4271 * no dynamic state dependencies.
4273 * 3DSTATE_STREAMOUT is an annoying mix of shader-based information and
4274 * state-based settings. We capture the shader-related ones here, and merge
4275 * the rest in at draw time.
/* Build the 3DSTATE_STREAMOUT + 3DSTATE_SO_DECL_LIST packet pair from a
 * pipe_stream_output_info and the last-stage VUE map.  Returns (per the
 * surrounding code's convention) a ralloc'd dword buffer containing both
 * packed commands.  NOTE(review): some lines (e.g. the declaration of
 * max_decls, closing braces) are not visible in this view. */
4278 crocus_create_so_decl_list(const struct pipe_stream_output_info *info,
4279 const struct brw_vue_map *vue_map)
4281 struct GENX(SO_DECL) so_decl[PIPE_MAX_VERTEX_STREAMS][128];
4282 int buffer_mask[PIPE_MAX_VERTEX_STREAMS] = {0, 0, 0, 0};
4283 int next_offset[PIPE_MAX_VERTEX_STREAMS] = {0, 0, 0, 0};
4284 int decls[PIPE_MAX_VERTEX_STREAMS] = {0, 0, 0, 0};
4286 STATIC_ASSERT(ARRAY_SIZE(so_decl[0]) >= PIPE_MAX_SO_OUTPUTS);
4288 memset(so_decl, 0, sizeof(so_decl));
4290 /* Construct the list of SO_DECLs to be emitted. The formatting of the
4291 * command feels strange -- each dword pair contains a SO_DECL per stream.
4293 for (unsigned i = 0; i < info->num_outputs; i++) {
4294 const struct pipe_stream_output *output = &info->output[i];
4295 const int buffer = output->output_buffer;
4296 const int varying = output->register_index;
4297 const unsigned stream_id = output->stream;
4298 assert(stream_id < PIPE_MAX_VERTEX_STREAMS);
4300 buffer_mask[stream_id] |= 1 << buffer;
4302 assert(vue_map->varying_to_slot[varying] >= 0);
4304 /* Mesa doesn't store entries for gl_SkipComponents in the Outputs[]
4305 * array. Instead, it simply increments DstOffset for the following
4306 * input by the number of components that should be skipped.
4308 * Our hardware is unusual in that it requires us to program SO_DECLs
4309 * for fake "hole" components, rather than simply taking the offset
4310 * for each real varying. Each hole can have size 1, 2, 3, or 4; we
4311 * program as many size = 4 holes as we can, then a final hole to
4312 * accommodate the final 1, 2, or 3 remaining.
4314 int skip_components = output->dst_offset - next_offset[buffer];
/* Emit hole SO_DECLs, 4 components at a time, for skipped space. */
4316 while (skip_components > 0) {
4317 so_decl[stream_id][decls[stream_id]++] = (struct GENX(SO_DECL)) {
4319 .OutputBufferSlot = output->output_buffer,
4320 .ComponentMask = (1 << MIN2(skip_components, 4)) - 1,
4322 skip_components -= 4;
4325 next_offset[buffer] = output->dst_offset + output->num_components;
/* The real SO_DECL for this varying's components. */
4327 so_decl[stream_id][decls[stream_id]++] = (struct GENX(SO_DECL)) {
4328 .OutputBufferSlot = output->output_buffer,
4329 .RegisterIndex = vue_map->varying_to_slot[varying],
4331 ((1 << output->num_components) - 1) << output->start_component,
/* Track the longest per-stream decl list for sizing the packet. */
4334 if (decls[stream_id] > max_decls)
4335 max_decls = decls[stream_id];
/* Allocate one buffer holding 3DSTATE_STREAMOUT followed by the
 * variable-length 3DSTATE_SO_DECL_LIST (3 header + 2 dwords/entry). */
4338 unsigned dwords = GENX(3DSTATE_STREAMOUT_length) + (3 + 2 * max_decls);
4339 uint32_t *map = ralloc_size(NULL, sizeof(uint32_t) * dwords);
4340 uint32_t *so_decl_map = map + GENX(3DSTATE_STREAMOUT_length);
4342 crocus_pack_command(GENX(3DSTATE_STREAMOUT), map, sol) {
4343 int urb_entry_read_offset = 0;
4344 int urb_entry_read_length = (vue_map->num_slots + 1) / 2 -
4345 urb_entry_read_offset;
4347 /* We always read the whole vertex. This could be reduced at some
4348 * point by reading less and offsetting the register index in the
4351 sol.Stream0VertexReadOffset = urb_entry_read_offset;
4352 sol.Stream0VertexReadLength = urb_entry_read_length - 1;
4353 sol.Stream1VertexReadOffset = urb_entry_read_offset;
4354 sol.Stream1VertexReadLength = urb_entry_read_length - 1;
4355 sol.Stream2VertexReadOffset = urb_entry_read_offset;
4356 sol.Stream2VertexReadLength = urb_entry_read_length - 1;
4357 sol.Stream3VertexReadOffset = urb_entry_read_offset;
4358 sol.Stream3VertexReadLength = urb_entry_read_length - 1;
4360 // TODO: Double-check that stride == 0 means no buffer. Probably this
4361 // needs to go elsewhere, where the buffer enable stuff is actually
4364 sol.SOBufferEnable0 = !!info->stride[0];
4365 sol.SOBufferEnable1 = !!info->stride[1];
4366 sol.SOBufferEnable2 = !!info->stride[2];
4367 sol.SOBufferEnable3 = !!info->stride[3];
4369 /* Set buffer pitches; 0 means unbound. */
4370 sol.Buffer0SurfacePitch = 4 * info->stride[0];
4371 sol.Buffer1SurfacePitch = 4 * info->stride[1];
4372 sol.Buffer2SurfacePitch = 4 * info->stride[2];
4373 sol.Buffer3SurfacePitch = 4 * info->stride[3];
4377 crocus_pack_command(GENX(3DSTATE_SO_DECL_LIST), so_decl_map, list) {
4378 list.DWordLength = 3 + 2 * max_decls - 2;
4379 list.StreamtoBufferSelects0 = buffer_mask[0];
4380 list.StreamtoBufferSelects1 = buffer_mask[1];
4381 list.StreamtoBufferSelects2 = buffer_mask[2];
4382 list.StreamtoBufferSelects3 = buffer_mask[3];
4383 list.NumEntries0 = decls[0];
4384 list.NumEntries1 = decls[1];
4385 list.NumEntries2 = decls[2];
4386 list.NumEntries3 = decls[3];
/* Each SO_DECL_ENTRY interleaves one decl from each of the 4 streams. */
4389 for (int i = 0; i < max_decls; i++) {
4390 crocus_pack_state(GENX(SO_DECL_ENTRY), so_decl_map + 3 + i * 2, entry) {
4391 entry.Stream0Decl = so_decl[0][i];
4392 entry.Stream1Decl = so_decl[1][i];
4393 entry.Stream2Decl = so_decl[2][i];
4394 entry.Stream3Decl = so_decl[3][i];
/* Emit 3DSTATE_GS_SVB_INDEX packets: index 0 carries the resumed
 * streamed-vertex count and the smallest max-vertex limit across bound
 * SO targets; indices 1-3 get benign defaults. */
4404 crocus_emit_so_svbi(struct crocus_context *ice)
4406 struct crocus_batch *batch = &ice->batches[CROCUS_BATCH_RENDER];
/* Clamp MaximumIndex to the smallest target capacity (in vertices). */
4408 unsigned max_vertex = 0xffffffff;
4409 for (int i = 0; i < PIPE_MAX_SO_BUFFERS; i++) {
4410 struct crocus_stream_output_target *tgt =
4411 (void *) ice->state.so_target[i];
4413 max_vertex = MIN2(max_vertex, tgt->base.buffer_size / tgt->stride);
4416 crocus_emit_cmd(batch, GENX(3DSTATE_GS_SVB_INDEX), svbi) {
4417 svbi.IndexNumber = 0;
4418 svbi.StreamedVertexBufferIndex = (uint32_t)ice->state.svbi; /* fix when resuming, based on target's prim count */
4419 svbi.MaximumIndex = max_vertex;
4422 /* initialize the rest of the SVBI's to reasonable values so that we don't
4423 * run out of room writing the regular data.
4425 for (int i = 1; i < 4; i++) {
4426 crocus_emit_cmd(batch, GENX(3DSTATE_GS_SVB_INDEX), svbi) {
4427 svbi.IndexNumber = i;
4428 svbi.StreamedVertexBufferIndex = 0;
4429 svbi.MaximumIndex = 0xffffffff;
/* Return whether the current draw will rasterize points: either polygon
 * fill mode is POINT, the final geometry stage (GS, then TES) emits a
 * point topology, or the primitive mode itself is POINTS.
 * NOTE(review): the `return true;` after the fill-mode check is implied
 * but not visible in this sampled view. */
4439 crocus_is_drawing_points(const struct crocus_context *ice)
4441 const struct crocus_rasterizer_state *cso_rast = ice->state.cso_rast;
4443 if (cso_rast->cso.fill_front == PIPE_POLYGON_MODE_POINT ||
4444 cso_rast->cso.fill_back == PIPE_POLYGON_MODE_POINT)
/* The last enabled geometry stage decides the output topology. */
4447 if (ice->shaders.prog[MESA_SHADER_GEOMETRY]) {
4448 const struct brw_gs_prog_data *gs_prog_data =
4449 (void *) ice->shaders.prog[MESA_SHADER_GEOMETRY]->prog_data;
4450 return gs_prog_data->output_topology == _3DPRIM_POINTLIST;
4451 } else if (ice->shaders.prog[MESA_SHADER_TESS_EVAL]) {
4452 const struct brw_tes_prog_data *tes_data =
4453 (void *) ice->shaders.prog[MESA_SHADER_TESS_EVAL]->prog_data;
4454 return tes_data->output_topology == BRW_TESS_OUTPUT_TOPOLOGY_POINT;
4456 return ice->state.prim_mode == PIPE_PRIM_POINTS;
/* get_attr_override (signature's opening line is not visible in this
 * sampled view; the name is grounded by the call in
 * calculate_attr_overrides): fill one SF_OUTPUT_ATTRIBUTE_DETAIL for the
 * FS input `fs_attr`, handling VUE-header overrides, back-color
 * fallback, missing-attribute PRIM_ID substitution, and two-sided color
 * swizzling; updates *max_source_attr. */
4464 struct GENX(SF_OUTPUT_ATTRIBUTE_DETAIL) *attr,
4465 const struct brw_vue_map *vue_map,
4466 int urb_entry_read_offset, int fs_attr,
4467 bool two_side_color, uint32_t *max_source_attr)
4469 /* Find the VUE slot for this attribute. */
4470 int slot = vue_map->varying_to_slot[fs_attr];
4472 /* Viewport and Layer are stored in the VUE header. We need to override
4473 * them to zero if earlier stages didn't write them, as GL requires that
4474 * they read back as zero when not explicitly set.
4476 if (fs_attr == VARYING_SLOT_VIEWPORT || fs_attr == VARYING_SLOT_LAYER) {
4477 attr->ComponentOverrideX = true;
4478 attr->ComponentOverrideW = true;
4479 attr->ConstantSource = CONST_0000;
4481 if (!(vue_map->slots_valid & VARYING_BIT_LAYER))
4482 attr->ComponentOverrideY = true;
4483 if (!(vue_map->slots_valid & VARYING_BIT_VIEWPORT))
4484 attr->ComponentOverrideZ = true;
4489 /* If there was only a back color written but not front, use back
4490 * as the color instead of undefined
4492 if (slot == -1 && fs_attr == VARYING_SLOT_COL0)
4493 slot = vue_map->varying_to_slot[VARYING_SLOT_BFC0];
4494 if (slot == -1 && fs_attr == VARYING_SLOT_COL1)
4495 slot = vue_map->varying_to_slot[VARYING_SLOT_BFC1];
4498 /* This attribute does not exist in the VUE--that means that the vertex
4499 * shader did not write to it. This means that either:
4501 * (a) This attribute is a texture coordinate, and it is going to be
4502 * replaced with point coordinates (as a consequence of a call to
4503 * glTexEnvi(GL_POINT_SPRITE, GL_COORD_REPLACE, GL_TRUE)), so the
4504 * hardware will ignore whatever attribute override we supply.
4506 * (b) This attribute is read by the fragment shader but not written by
4507 * the vertex shader, so its value is undefined. Therefore the
4508 * attribute override we supply doesn't matter.
4510 * (c) This attribute is gl_PrimitiveID, and it wasn't written by the
4511 * previous shader stage.
4513 * Note that we don't have to worry about the cases where the attribute
4514 * is gl_PointCoord or is undergoing point sprite coordinate
4515 * replacement, because in those cases, this function isn't called.
4517 * In case (c), we need to program the attribute overrides so that the
4518 * primitive ID will be stored in this slot. In every other case, the
4519 * attribute override we supply doesn't matter. So just go ahead and
4520 * program primitive ID in every case.
4522 attr->ComponentOverrideW = true;
4523 attr->ComponentOverrideX = true;
4524 attr->ComponentOverrideY = true;
4525 attr->ComponentOverrideZ = true;
4526 attr->ConstantSource = PRIM_ID;
4530 /* Compute the location of the attribute relative to urb_entry_read_offset.
4531 * Each increment of urb_entry_read_offset represents a 256-bit value, so
4532 * it counts for two 128-bit VUE slots.
4534 int source_attr = slot - 2 * urb_entry_read_offset;
4535 assert(source_attr >= 0 && source_attr < 32);
4537 /* If we are doing two-sided color, and the VUE slot following this one
4538 * represents a back-facing color, then we need to instruct the SF unit to
4539 * do back-facing swizzling.
4541 bool swizzling = two_side_color &&
4542 ((vue_map->slot_to_varying[slot] == VARYING_SLOT_COL0 &&
4543 vue_map->slot_to_varying[slot+1] == VARYING_SLOT_BFC0) ||
4544 (vue_map->slot_to_varying[slot] == VARYING_SLOT_COL1 &&
4545 vue_map->slot_to_varying[slot+1] == VARYING_SLOT_BFC1));
4547 /* Update max_source_attr. If swizzling, the SF will read this slot + 1. */
4548 if (*max_source_attr < source_attr + swizzling)
4549 *max_source_attr = source_attr + swizzling;
4551 attr->SourceAttribute = source_attr;
/* Presumably guarded by `if (swizzling)` on a line not visible here. */
4553 attr->SwizzleSelect = INPUTATTR_FACING;
/* Compute the full set of SF attribute overrides, point-sprite enable
 * bits, and the URB entry read offset/length for the current FS inputs.
 * Outputs: attr_overrides[16], *point_sprite_enables (bitmask by input
 * index), *urb_entry_read_length, *urb_entry_read_offset. */
4557 calculate_attr_overrides(
4558 const struct crocus_context *ice,
4559 struct GENX(SF_OUTPUT_ATTRIBUTE_DETAIL) *attr_overrides,
4560 uint32_t *point_sprite_enables,
4561 uint32_t *urb_entry_read_length,
4562 uint32_t *urb_entry_read_offset)
4564 const struct brw_wm_prog_data *wm_prog_data = (void *)
4565 ice->shaders.prog[MESA_SHADER_FRAGMENT]->prog_data;
4566 const struct brw_vue_map *vue_map = ice->shaders.last_vue_map;
4567 const struct crocus_rasterizer_state *cso_rast = ice->state.cso_rast;
4568 uint32_t max_source_attr = 0;
4569 const struct shader_info *fs_info =
4570 crocus_get_shader_info(ice, MESA_SHADER_FRAGMENT);
4573 brw_compute_first_urb_slot_required(fs_info->inputs_read, vue_map);
4575 /* Each URB offset packs two varying slots */
4576 assert(first_slot % 2 == 0);
4577 *urb_entry_read_offset = first_slot / 2;
4578 *point_sprite_enables = 0;
/* Walk every FS input the WM program actually reads. */
4580 for (int fs_attr = 0; fs_attr < VARYING_SLOT_MAX; fs_attr++) {
4581 const int input_index = wm_prog_data->urb_setup[fs_attr];
4583 if (input_index < 0)
/* TEX0..TEX7 with sprite_coord_enable set, and gl_PointCoord, are
 * replaced with point sprite coordinates when drawing points. */
4586 bool point_sprite = false;
4587 if (crocus_is_drawing_points(ice)) {
4588 if (fs_attr >= VARYING_SLOT_TEX0 &&
4589 fs_attr <= VARYING_SLOT_TEX7 &&
4590 cso_rast->cso.sprite_coord_enable & (1 << (fs_attr - VARYING_SLOT_TEX0)))
4591 point_sprite = true;
4593 if (fs_attr == VARYING_SLOT_PNTC)
4594 point_sprite = true;
4597 *point_sprite_enables |= 1U << input_index;
4600 struct GENX(SF_OUTPUT_ATTRIBUTE_DETAIL) attribute = { 0 };
4601 if (!point_sprite) {
4602 get_attr_override(&attribute, vue_map, *urb_entry_read_offset, fs_attr,
4603 cso_rast->cso.light_twoside, &max_source_attr);
4606 /* The hardware can only do the overrides on 16 overrides at a
4607 * time, and the other up to 16 have to be lined up so that the
4608 * input index = the output index. We'll need to do some
4609 * tweaking to make sure that's the case.
4611 if (input_index < 16)
4612 attr_overrides[input_index] = attribute;
4614 assert(attribute.SourceAttribute == input_index);
4617 /* From the Sandy Bridge PRM, Volume 2, Part 1, documentation for
4618 * 3DSTATE_SF DWord 1 bits 15:11, "Vertex URB Entry Read Length":
4620 * "This field should be set to the minimum length required to read the
4621 * maximum source attribute. The maximum source attribute is indicated
4622 * by the maximum value of the enabled Attribute # Source Attribute if
4623 * Attribute Swizzle Enable is set, Number of Output Attributes-1 if
4624 * enable is not set.
4625 * read_length = ceiling((max_source_attr + 1) / 2)
4627 * [errata] Corruption/Hang possible if length programmed larger than
4630 * Similar text exists for Ivy Bridge.
4632 *urb_entry_read_length = DIV_ROUND_UP(max_source_attr + 1, 2);
/* Emit 3DSTATE_SBE (setup backend) from the FS program data and current
 * rasterizer state; on gens with a separate swizzle packet, also emits
 * 3DSTATE_SBE_SWIZ with the 16 attribute overrides.  NOTE(review): the
 * #define of attr_overrides suggests a GFX_VER split not visible here. */
4638 crocus_emit_sbe(struct crocus_batch *batch, const struct crocus_context *ice)
4640 const struct crocus_rasterizer_state *cso_rast = ice->state.cso_rast;
4641 const struct brw_wm_prog_data *wm_prog_data = (void *)
4642 ice->shaders.prog[MESA_SHADER_FRAGMENT]->prog_data;
4644 struct GENX(SF_OUTPUT_ATTRIBUTE_DETAIL) attr_overrides[16] = { { 0 } };
/* On some gens the overrides live inside the SBE packet itself. */
4646 #define attr_overrides sbe.Attribute
4649 uint32_t urb_entry_read_length;
4650 uint32_t urb_entry_read_offset;
4651 uint32_t point_sprite_enables;
4653 crocus_emit_cmd(batch, GENX(3DSTATE_SBE), sbe) {
4654 sbe.AttributeSwizzleEnable = true;
4655 sbe.NumberofSFOutputAttributes = wm_prog_data->num_varying_inputs;
4656 sbe.PointSpriteTextureCoordinateOrigin = cso_rast->cso.sprite_coord_mode;
4658 calculate_attr_overrides(ice,
4660 &point_sprite_enables,
4661 &urb_entry_read_length,
4662 &urb_entry_read_offset);
4663 sbe.VertexURBEntryReadOffset = urb_entry_read_offset;
4664 sbe.VertexURBEntryReadLength = urb_entry_read_length;
4665 sbe.ConstantInterpolationEnable = wm_prog_data->flat_inputs;
4666 sbe.PointSpriteTextureCoordinateEnable = point_sprite_enables;
4668 sbe.ForceVertexURBEntryReadLength = true;
4669 sbe.ForceVertexURBEntryReadOffset = true;
/* Separate swizzle packet carrying the 16 overrides. */
4673 crocus_emit_cmd(batch, GENX(3DSTATE_SBE_SWIZ), sbes) {
4674 for (int i = 0; i < 16; i++)
4675 sbes.Attribute[i] = attr_overrides[i];
4681 /* ------------------------------------------------------------------- */
4684 * Populate VS program key fields based on the current state.
/* Populate the VS program key from current state: user clip planes and
 * point-size clamping (only when the VS is the last pre-raster stage),
 * edge-flag copying, point-coord replacement, vertex color clamping, and
 * per-attribute workaround flags from the bound vertex elements. */
4687 crocus_populate_vs_key(const struct crocus_context *ice,
4688 const struct shader_info *info,
4689 gl_shader_stage last_stage,
4690 struct brw_vs_prog_key *key)
4692 const struct crocus_rasterizer_state *cso_rast = ice->state.cso_rast;
/* Lowered user clip planes are only needed when the shader itself has
 * no clip distances and the VS feeds the rasterizer directly. */
4694 if (info->clip_distance_array_size == 0 &&
4695 (info->outputs_written & (VARYING_BIT_POS | VARYING_BIT_CLIP_VERTEX)) &&
4696 last_stage == MESA_SHADER_VERTEX)
4697 key->nr_userclip_plane_consts = cso_rast->num_clip_plane_consts;
4699 if (last_stage == MESA_SHADER_VERTEX &&
4700 info->outputs_written & (VARYING_BIT_PSIZ))
4701 key->clamp_pointsize = 1;
4704 key->copy_edgeflag = (cso_rast->cso.fill_back != PIPE_POLYGON_MODE_FILL ||
4705 cso_rast->cso.fill_front != PIPE_POLYGON_MODE_FILL);
4706 key->point_coord_replace = cso_rast->cso.sprite_coord_enable & 0xff;
4709 key->clamp_vertex_color = cso_rast->cso.clamp_vertex_color;
/* Copy vertex-element workaround flags for each attribute actually read.
 * NOTE(review): ve_idx's declaration/advance is not visible here. */
4712 uint64_t inputs_read = info->inputs_read;
4714 while (inputs_read) {
4715 int i = u_bit_scan64(&inputs_read);
4716 key->gl_attrib_wa_flags[i] = ice->state.cso_vertex_elements->wa_flags[ve_idx];
4723 * Populate TCS program key fields based on the current state.
/* Populate the TCS program key; no state-dependent fields are visible in
 * this view (body appears empty). */
4726 crocus_populate_tcs_key(const struct crocus_context *ice,
4727 struct brw_tcs_prog_key *key)
4732 * Populate TES program key fields based on the current state.
/* Populate the TES program key: user clip plane consts and point-size
 * clamping, applied only when the TES is the last pre-raster stage
 * (mirrors crocus_populate_vs_key). */
4735 crocus_populate_tes_key(const struct crocus_context *ice,
4736 const struct shader_info *info,
4737 gl_shader_stage last_stage,
4738 struct brw_tes_prog_key *key)
4740 const struct crocus_rasterizer_state *cso_rast = ice->state.cso_rast;
4742 if (info->clip_distance_array_size == 0 &&
4743 (info->outputs_written & (VARYING_BIT_POS | VARYING_BIT_CLIP_VERTEX)) &&
4744 last_stage == MESA_SHADER_TESS_EVAL)
4745 key->nr_userclip_plane_consts = cso_rast->num_clip_plane_consts;
4747 if (last_stage == MESA_SHADER_TESS_EVAL &&
4748 info->outputs_written & (VARYING_BIT_PSIZ))
4749 key->clamp_pointsize = 1;
4753 * Populate GS program key fields based on the current state.
/* Populate the GS program key: user clip plane consts and point-size
 * clamping, applied only when the GS is the last pre-raster stage
 * (mirrors crocus_populate_vs_key / _tes_key). */
4756 crocus_populate_gs_key(const struct crocus_context *ice,
4757 const struct shader_info *info,
4758 gl_shader_stage last_stage,
4759 struct brw_gs_prog_key *key)
4761 const struct crocus_rasterizer_state *cso_rast = ice->state.cso_rast;
4763 if (info->clip_distance_array_size == 0 &&
4764 (info->outputs_written & (VARYING_BIT_POS | VARYING_BIT_CLIP_VERTEX)) &&
4765 last_stage == MESA_SHADER_GEOMETRY)
4766 key->nr_userclip_plane_consts = cso_rast->num_clip_plane_consts;
4768 if (last_stage == MESA_SHADER_GEOMETRY &&
4769 info->outputs_written & (VARYING_BIT_PSIZ))
4770 key->clamp_pointsize = 1;
4774 * Populate FS program key fields based on the current state.
/* Populate the FS (WM) program key from framebuffer, depth/stencil,
 * rasterizer, and blend state: the iz lookup bits, line AA mode,
 * color-clamp/flat-shade/multisample flags, and the emulated alpha test
 * for multiple render targets. */
4777 crocus_populate_fs_key(const struct crocus_context *ice,
4778 const struct shader_info *info,
4779 struct brw_wm_prog_key *key)
4781 struct crocus_screen *screen = (void *) ice->ctx.screen;
4782 const struct pipe_framebuffer_state *fb = &ice->state.framebuffer;
4783 const struct crocus_depth_stencil_alpha_state *zsa = ice->state.cso_zsa;
4784 const struct crocus_rasterizer_state *rast = ice->state.cso_rast;
4785 const struct crocus_blend_state *blend = ice->state.cso_blend;
/* Build the BRW_WM_IZ_* lookup bitfield from discard/alpha/depth/stencil
 * state. */
4788 uint32_t lookup = 0;
4790 if (info->fs.uses_discard || zsa->cso.alpha_enabled)
4791 lookup |= BRW_WM_IZ_PS_KILL_ALPHATEST_BIT;
4793 if (info->outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
4794 lookup |= BRW_WM_IZ_PS_COMPUTES_DEPTH_BIT;
4796 if (fb->zsbuf && zsa->cso.depth_enabled) {
4797 lookup |= BRW_WM_IZ_DEPTH_TEST_ENABLE_BIT;
4799 if (zsa->cso.depth_writemask)
4800 lookup |= BRW_WM_IZ_DEPTH_WRITE_ENABLE_BIT;
4803 if (zsa->cso.stencil[0].enabled || zsa->cso.stencil[1].enabled) {
4804 lookup |= BRW_WM_IZ_STENCIL_TEST_ENABLE_BIT;
4805 if (zsa->cso.stencil[0].writemask || zsa->cso.stencil[1].writemask)
4806 lookup |= BRW_WM_IZ_STENCIL_WRITE_ENABLE_BIT;
4808 key->iz_lookup = lookup;
4809 key->stats_wm = ice->state.stats_wm;
/* Line antialiasing mode depends on the reduced primitive and, for
 * triangles, which faces are drawn as lines and which are culled. */
4812 uint32_t line_aa = BRW_WM_AA_NEVER;
4813 if (rast->cso.line_smooth) {
4814 int reduced_prim = ice->state.reduced_prim_mode;
4815 if (reduced_prim == PIPE_PRIM_LINES)
4816 line_aa = BRW_WM_AA_ALWAYS;
4817 else if (reduced_prim == PIPE_PRIM_TRIANGLES) {
4818 if (rast->cso.fill_front == PIPE_POLYGON_MODE_LINE) {
4819 line_aa = BRW_WM_AA_SOMETIMES;
4821 if (rast->cso.fill_back == PIPE_POLYGON_MODE_LINE ||
4822 rast->cso.cull_face == PIPE_FACE_BACK)
4823 line_aa = BRW_WM_AA_ALWAYS;
4824 } else if (rast->cso.fill_back == PIPE_POLYGON_MODE_LINE) {
4825 line_aa = BRW_WM_AA_SOMETIMES;
4827 if (rast->cso.cull_face == PIPE_FACE_FRONT)
4828 line_aa = BRW_WM_AA_ALWAYS;
4832 key->line_aa = line_aa;
4834 key->nr_color_regions = fb->nr_cbufs;
4836 key->clamp_fragment_color = rast->cso.clamp_fragment_color;
4838 key->alpha_to_coverage = blend->cso.alpha_to_coverage;
/* With multiple render targets, alpha testing reads RT0's alpha. */
4840 key->alpha_test_replicate_alpha = fb->nr_cbufs > 1 && zsa->cso.alpha_enabled;
4842 key->flat_shade = rast->cso.flatshade &&
4843 (info->inputs_read & (VARYING_BIT_COL0 | VARYING_BIT_COL1));
4845 key->persample_interp = rast->cso.force_persample_interp;
4846 key->multisample_fbo = rast->cso.multisample && fb->samples > 1;
4848 key->ignore_sample_mask_out = !key->multisample_fbo;
4849 key->coherent_fb_fetch = false; // TODO: needed?
4851 key->force_dual_color_blend =
4852 screen->driconf.dual_color_blend_by_location &&
4853 (blend->blend_enables & 1) && blend->dual_color_blending;
/* Emulate the alpha test in the shader when MRT is in use. */
4856 if (fb->nr_cbufs > 1 && zsa->cso.alpha_enabled) {
4857 key->emit_alpha_test = true;
4858 key->alpha_test_func = zsa->cso.alpha_func;
4859 key->alpha_test_ref = zsa->cso.alpha_ref_value;
/* Populate the CS program key; no state-dependent fields are visible in
 * this view (body appears empty). */
4865 crocus_populate_cs_key(const struct crocus_context *ice,
4866 struct brw_cs_prog_key *key)
/* Kernel start pointer for a compiled shader.  Two variants appear here:
 * a macro producing a relocation into the shader cache BO, and a
 * function returning the raw offset — presumably selected by a GFX_VER
 * guard not visible in this sampled view. */
4871 #define KSP(ice, shader) ro_bo((ice)->shaders.cache_bo, (shader)->offset);
4874 KSP(const struct crocus_context *ice, const struct crocus_compiled_shader *shader)
4876 return shader->offset;
4880 /* Gen11 workaround table #2056 WABTPPrefetchDisable suggests to disable
4881 * prefetching of binding tables in A0 and B0 steppings. XXX: Revisit
4882 * this WA on C0 stepping.
4884 * TODO: Fill out SamplerCount for prefetching?
/* Shared initializer for 3DSTATE_{VS,HS,DS,GS}-style packets: kernel
 * start pointer, binding table count, URB read length/offset, dispatch
 * GRF start, statistics enable, and scratch space setup.  (Comments
 * cannot be placed inside the backslash-continued body.) */
4887 #define INIT_THREAD_DISPATCH_FIELDS(pkt, prefix, stage) \
4888 pkt.KernelStartPointer = KSP(ice, shader); \
4889 pkt.BindingTableEntryCount = shader->bt.size_bytes / 4; \
4890 pkt.FloatingPointMode = prog_data->use_alt_mode; \
4892 pkt.DispatchGRFStartRegisterForURBData = \
4893 prog_data->dispatch_grf_start_reg; \
4894 pkt.prefix##URBEntryReadLength = vue_prog_data->urb_read_length; \
4895 pkt.prefix##URBEntryReadOffset = 0; \
4897 pkt.StatisticsEnable = true; \
4898 pkt.Enable = true; \
4900 if (prog_data->total_scratch) { \
4901 struct crocus_bo *bo = \
4902 crocus_get_scratch_space(ice, prog_data->total_scratch, stage); \
4903 pkt.PerThreadScratchSpace = ffs(prog_data->total_scratch) - 11; \
4904 pkt.ScratchSpaceBasePointer = rw_bo(bo, 0); \
4907 /* ------------------------------------------------------------------- */
/* 3DSTATE_CONSTANT_* sub-opcode per shader stage (0 = no push-constant
 * packet for compute). */
4909 static const uint32_t push_constant_opcodes[] = {
4910 [MESA_SHADER_VERTEX] = 21,
4911 [MESA_SHADER_TESS_CTRL] = 25, /* HS */
4912 [MESA_SHADER_TESS_EVAL] = 26, /* DS */
4913 [MESA_SHADER_GEOMETRY] = 22,
4914 [MESA_SHADER_FRAGMENT] = 23,
4915 [MESA_SHADER_COMPUTE] = 0,
/* Stream out a null surface state of the given extent; *out_offset
 * receives the state's batch offset. */
4920 emit_sized_null_surface(struct crocus_batch *batch,
4921 unsigned width, unsigned height,
4922 unsigned layers, unsigned levels,
4923 unsigned minimum_array_element,
4924 uint32_t *out_offset)
4926 struct isl_device *isl_dev = &batch->screen->isl_dev;
4927 uint32_t *surf = stream_state(batch, isl_dev->ss.size,
4930 //TODO gen 6 multisample crash
4931 isl_null_fill_state(isl_dev, surf,
4932 .size = isl_extent3d(width, height, layers),
4934 .minimum_array_element = minimum_array_element);
/* Convenience wrapper: emit a 1x1x1 null surface. */
4937 emit_null_surface(struct crocus_batch *batch,
4938 uint32_t *out_offset)
4940 emit_sized_null_surface(batch, 1, 1, 1, 0, 0, out_offset);
/* Emit a null surface sized to match the current framebuffer (so
 * depth-only rendering gets correctly-sized null color surfaces); falls
 * back to 1x1x1 if no framebuffer was ever set. */
4944 emit_null_fb_surface(struct crocus_batch *batch,
4945 struct crocus_context *ice,
4946 uint32_t *out_offset)
4948 uint32_t width, height, layers, level, layer;
4949 /* If set_framebuffer_state() was never called, fall back to 1x1x1 */
4950 if (ice->state.framebuffer.width == 0 && ice->state.framebuffer.height == 0) {
4951 emit_null_surface(batch, out_offset);
4955 struct pipe_framebuffer_state *cso = &ice->state.framebuffer;
4956 width = MAX2(cso->width, 1);
4957 height = MAX2(cso->height, 1);
4958 layers = cso->layers ? cso->layers : 1;
/* Depth-only FB: size the null surface from the Z/S attachment. */
4962 if (cso->nr_cbufs == 0 && cso->zsbuf) {
4963 width = cso->zsbuf->width;
4964 height = cso->zsbuf->height;
4965 level = cso->zsbuf->u.tex.level;
4966 layer = cso->zsbuf->u.tex.first_layer;
4968 emit_sized_null_surface(batch, width, height,
4969 layers, level, layer,
/* Fill a SURFACE_STATE for a crocus resource via isl, handling 3D/cube
 * single-slice views and 1D-array promotion, aux surfaces/clear colors,
 * and patching the aux address dword with a relocation.  NOTE(review):
 * several guard lines (e.g. the writeable check, gen guards around the
 * aux-address fixup) are not visible in this sampled view. */
4974 emit_surface_state(struct crocus_batch *batch,
4975 struct crocus_resource *res,
4976 const struct isl_surf *in_surf,
4978 struct isl_view *in_view,
4980 enum isl_aux_usage aux_usage,
4982 uint32_t write_disables,
4983 uint32_t *surf_state,
4984 uint32_t addr_offset)
4986 struct isl_device *isl_dev = &batch->screen->isl_dev;
4987 uint32_t reloc = RELOC_32BIT;
4988 uint64_t offset_B = res->offset;
4989 uint32_t tile_x_sa = 0, tile_y_sa = 0;
4992 reloc |= RELOC_WRITE;
4994 struct isl_surf surf = *in_surf;
4995 struct isl_view view = *in_view;
/* Single-slice 3D views and gen4 cube faces are rebased to a child
 * image surface with intra-tile x/y offsets. */
4997 if (res->base.b.target == PIPE_TEXTURE_3D && view.array_len == 1) {
4998 isl_surf_get_image_surf(isl_dev, in_surf,
5000 view.base_array_layer,
5002 &tile_x_sa, &tile_y_sa);
5003 view.base_array_layer = 0;
5004 view.base_level = 0;
5005 } else if (res->base.b.target == PIPE_TEXTURE_CUBE && GFX_VER == 4) {
5006 isl_surf_get_image_surf(isl_dev, in_surf,
5007 view.base_level, view.base_array_layer,
5010 &tile_x_sa, &tile_y_sa);
5011 view.base_array_layer = 0;
5012 view.base_level = 0;
5013 } else if (res->base.b.target == PIPE_TEXTURE_1D_ARRAY)
5014 surf.dim = ISL_SURF_DIM_2D;
/* Pick up the aux surface and clear color when aux is in use. */
5017 union isl_color_value clear_color = { .u32 = { 0, 0, 0, 0 } };
5018 struct crocus_bo *aux_bo = NULL;
5019 uint32_t aux_offset = 0;
5020 struct isl_surf *aux_surf = NULL;
5021 if (aux_usage != ISL_AUX_USAGE_NONE) {
5022 aux_surf = &res->aux.surf;
5023 aux_offset = res->aux.offset;
5024 aux_bo = res->aux.bo;
5026 clear_color = crocus_resource_get_clear_color(res);
5029 isl_surf_fill_state(isl_dev, surf_state,
5032 .address = crocus_state_reloc(batch,
5033 addr_offset + isl_dev->ss.addr_offset,
5034 res->bo, offset_B, reloc),
5035 .aux_surf = aux_surf,
5036 .aux_usage = aux_usage,
5037 .aux_address = aux_offset,
5038 .mocs = crocus_mocs(res->bo, isl_dev),
5039 .clear_color = clear_color,
5040 .use_clear_address = false,
5042 .x_offset_sa = tile_x_sa,
5043 .y_offset_sa = tile_y_sa,
5045 .blend_enable = blend_enable,
5046 .write_disables = write_disables,
5051 /* On gen7 and prior, the upper 20 bits of surface state DWORD 6 are the
5052 * upper 20 bits of the GPU address of the MCS buffer; the lower 12 bits
5053 * contain other control information. Since buffer addresses are always
5054 * on 4k boundaries (and thus have their lower 12 bits zero), we can use
5055 * an ordinary reloc to do the necessary address translation.
5057 * FIXME: move to the point of assignment.
/* 64-bit vs 32-bit aux address patch — presumably split by a gen/config
 * guard not visible here. */
5060 uint64_t *aux_addr = (uint64_t *)(surf_state + (isl_dev->ss.aux_addr_offset / 4));
5061 *aux_addr = crocus_state_reloc(batch,
5062 addr_offset + isl_dev->ss.aux_addr_offset,
5066 uint32_t *aux_addr = surf_state + (isl_dev->ss.aux_addr_offset / 4);
5067 *aux_addr = crocus_state_reloc(batch,
5068 addr_offset + isl_dev->ss.aux_addr_offset,
/* Stream a writable render-target SURFACE_STATE for a crocus_surface,
 * substituting the alignment workaround resource when present and
 * flagging gen4 cube surfaces for adjustment. */
5077 emit_surface(struct crocus_batch *batch,
5078 struct crocus_surface *surf,
5079 enum isl_aux_usage aux_usage,
5081 uint32_t write_disables)
5083 struct isl_device *isl_dev = &batch->screen->isl_dev;
5084 struct crocus_resource *res = (struct crocus_resource *)surf->base.texture;
5085 struct isl_view *view = &surf->view;
5086 uint32_t offset = 0;
5087 enum pipe_texture_target target = res->base.b.target;
5088 bool adjust_surf = false;
/* gen4 cube RTs need the per-face surface adjustment. */
5090 if (GFX_VER == 4 && target == PIPE_TEXTURE_CUBE)
5093 if (surf->align_res)
5094 res = (struct crocus_resource *)surf->align_res;
5096 uint32_t *surf_state = stream_state(batch, isl_dev->ss.size, isl_dev->ss.align, &offset);
5098 emit_surface_state(batch, res, &surf->surf, adjust_surf, view, true,
5099 aux_usage, blend_enable,
5101 surf_state, offset);
/* Stream a read-only SURFACE_STATE for sampling a render target via its
 * read_view (non-writable, no blend). */
5106 emit_rt_surface(struct crocus_batch *batch,
5107 struct crocus_surface *surf,
5108 enum isl_aux_usage aux_usage)
5110 struct isl_device *isl_dev = &batch->screen->isl_dev;
5111 struct crocus_resource *res = (struct crocus_resource *)surf->base.texture;
5112 struct isl_view *view = &surf->read_view;
5113 uint32_t offset = 0;
5114 uint32_t *surf_state = stream_state(batch, isl_dev->ss.size, isl_dev->ss.align, &offset);
5116 emit_surface_state(batch, res, &surf->surf, true, view, false,
5117 aux_usage, 0, false,
5118 surf_state, offset);
/* Stream a RAW buffer SURFACE_STATE pointing at the compute grid-size
 * buffer (used for indirect dispatch parameter reads). */
5123 emit_grid(struct crocus_context *ice,
5124 struct crocus_batch *batch)
5126 UNUSED struct isl_device *isl_dev = &batch->screen->isl_dev;
5127 uint32_t offset = 0;
5128 struct crocus_state_ref *grid_ref = &ice->state.grid_size;
5129 uint32_t *surf_state = stream_state(batch, isl_dev->ss.size,
5130 isl_dev->ss.align, &offset);
5131 isl_buffer_fill_state(isl_dev, surf_state,
5132 .address = crocus_state_reloc(batch, offset + isl_dev->ss.addr_offset,
5133 crocus_resource_bo(grid_ref->res),
5137 .format = ISL_FORMAT_RAW,
5139 .mocs = crocus_mocs(crocus_resource_bo(grid_ref->res), isl_dev));
/* Stream a buffer SURFACE_STATE for a bound constant (UBO) buffer.
 * NOTE(review): the format line is not visible in this sampled view. */
5144 emit_ubo_buffer(struct crocus_context *ice,
5145 struct crocus_batch *batch,
5146 struct pipe_constant_buffer *buffer)
5148 UNUSED struct isl_device *isl_dev = &batch->screen->isl_dev;
5149 uint32_t offset = 0;
5151 uint32_t *surf_state = stream_state(batch, isl_dev->ss.size,
5152 isl_dev->ss.align, &offset);
5153 isl_buffer_fill_state(isl_dev, surf_state,
5154 .address = crocus_state_reloc(batch, offset + isl_dev->ss.addr_offset,
5155 crocus_resource_bo(buffer->buffer),
5156 buffer->buffer_offset,
5158 .size_B = buffer->buffer_size,
5160 .swizzle = ISL_SWIZZLE_IDENTITY,
5162 .mocs = crocus_mocs(crocus_resource_bo(buffer->buffer), isl_dev));
/* Stream a RAW buffer SURFACE_STATE for a shader storage buffer; the
 * relocation gains RELOC_WRITE when the SSBO is writeable. */
5168 emit_ssbo_buffer(struct crocus_context *ice,
5169 struct crocus_batch *batch,
5170 struct pipe_shader_buffer *buffer, bool writeable)
5172 UNUSED struct isl_device *isl_dev = &batch->screen->isl_dev;
5173 uint32_t offset = 0;
5174 uint32_t reloc = RELOC_32BIT;
/* Presumably guarded by `if (writeable)` on a line not visible here. */
5177 reloc |= RELOC_WRITE;
5178 uint32_t *surf_state = stream_state(batch, isl_dev->ss.size,
5179 isl_dev->ss.align, &offset);
5180 isl_buffer_fill_state(isl_dev, surf_state,
5181 .address = crocus_state_reloc(batch, offset + isl_dev->ss.addr_offset,
5182 crocus_resource_bo(buffer->buffer),
5183 buffer->buffer_offset,
5185 .size_B = buffer->buffer_size,
5186 .format = ISL_FORMAT_RAW,
5187 .swizzle = ISL_SWIZZLE_IDENTITY,
5189 .mocs = crocus_mocs(crocus_resource_bo(buffer->buffer), isl_dev));
/* Emit a SURFACE_STATE for a sampler view (texture or texture buffer).
 *
 * Buffer targets use isl_buffer_fill_state with the view's format; image
 * targets go through emit_surface_state with the resource's computed aux
 * usage.  The gather path selects isv->gather_view instead of isv->view.
 *
 * NOTE(review): listing is elided — a `bool for_gather` parameter (used
 * below) is missing from the visible signature, as are braces/else and
 * the returned offset; confirm against full source.
 */
5195 emit_sampler_view(struct crocus_context *ice,
5196                   struct crocus_batch *batch,
5198                   struct crocus_sampler_view *isv)
5200    UNUSED struct isl_device *isl_dev = &batch->screen->isl_dev;
5201    uint32_t offset = 0;
5203    uint32_t *surf_state = stream_state(batch, isl_dev->ss.size,
5204                                        isl_dev->ss.align, &offset);
/* Texture-buffer path: a typed buffer surface rather than an image surface. */
5206    if (isv->base.target == PIPE_BUFFER) {
5207       const struct isl_format_layout *fmtl = isl_format_get_layout(isv->view.format);
/* RAW buffers are addressed in bytes; typed buffers in texels (bpb/8 bytes). */
5208       const unsigned cpp = isv->view.format == ISL_FORMAT_RAW ? 1 : fmtl->bpb / 8;
/* Clamp to the bound range, the BO's remaining bytes, and the HW max
 * texture-buffer size (in bytes). */
5209       unsigned final_size =
5210          MIN3(isv->base.u.buf.size, isv->res->bo->size - isv->res->offset,
5211               CROCUS_MAX_TEXTURE_BUFFER_SIZE * cpp);
5212       isl_buffer_fill_state(isl_dev, surf_state,
5213                             .address = crocus_state_reloc(batch, offset + isl_dev->ss.addr_offset,
5215                                                           isv->res->offset + isv->base.u.buf.offset, RELOC_32BIT),
5216                             .size_B = final_size,
5217                             .format = isv->view.format,
5218                             .swizzle = isv->view.swizzle,
5220                             .mocs = crocus_mocs(isv->res->bo, isl_dev)
/* Image path (presumably the else branch — brace elided). */
5223       enum isl_aux_usage aux_usage =
5224          crocus_resource_texture_aux_usage(isv->res);
5226       emit_surface_state(batch, isv->res, &isv->res->surf, false,
5227                          for_gather ? &isv->gather_view : &isv->view,
5228                          false, aux_usage, false,
5229                          0, surf_state, offset);
/* Emit a SURFACE_STATE for a shader image binding (read and/or write).
 *
 * Three cases are visible: typed buffer image, RAW buffer image (whole BO
 * window), and regular image resource via emit_surface_state.  The
 * relocation gains RELOC_WRITE when the binding has write access.
 *
 * NOTE(review): listing is elided — braces/else, some fill-state fields,
 * and the returned offset are not shown; confirm branch structure against
 * full source.
 */
5235 emit_image_view(struct crocus_context *ice,
5236                 struct crocus_batch *batch,
5237                 struct crocus_image_view *iv)
5239    UNUSED struct isl_device *isl_dev = &batch->screen->isl_dev;
5240    uint32_t offset = 0;
5242    struct crocus_resource *res = (struct crocus_resource *)iv->base.resource;
5243    uint32_t *surf_state = stream_state(batch, isl_dev->ss.size,
5244                                        isl_dev->ss.align, &offset);
/* Writable images need a write relocation so the kernel tracks GPU writes. */
5245    bool write = iv->base.shader_access & PIPE_IMAGE_ACCESS_WRITE;
5246    uint32_t reloc = RELOC_32BIT | (write ? RELOC_WRITE : 0);
5247    if (res->base.b.target == PIPE_BUFFER) {
5248       const struct isl_format_layout *fmtl = isl_format_get_layout(iv->view.format);
5249       const unsigned cpp = iv->view.format == ISL_FORMAT_RAW ? 1 : fmtl->bpb / 8;
/* Clamp the surface window to the bound range, the BO's remaining bytes,
 * and the HW texture-buffer size limit in bytes. */
5250       unsigned final_size =
5251          MIN3(iv->base.u.buf.size, res->bo->size - res->offset - iv->base.u.buf.offset,
5252               CROCUS_MAX_TEXTURE_BUFFER_SIZE * cpp);
5253       isl_buffer_fill_state(isl_dev, surf_state,
5254                             .address = crocus_state_reloc(batch, offset + isl_dev->ss.addr_offset,
5256                                                           res->offset + iv->base.u.buf.offset, reloc),
5257                             .size_B = final_size,
5258                             .format = iv->view.format,
5259                             .swizzle = iv->view.swizzle,
5261                             .mocs = crocus_mocs(res->bo, isl_dev)
/* RAW image view: expose the whole remaining BO as an untyped window. */
5264       if (iv->view.format == ISL_FORMAT_RAW) {
5265          isl_buffer_fill_state(isl_dev, surf_state,
5266                                .address = crocus_state_reloc(batch, offset + isl_dev->ss.addr_offset,
5268                                                              res->offset, reloc),
5269                                .size_B = res->bo->size - res->offset,
5270                                .format = iv->view.format,
5271                                .swizzle = iv->view.swizzle,
5273                                .mocs = crocus_mocs(res->bo, isl_dev),
/* Regular (non-buffer) image path. */
5278          emit_surface_state(batch, res,
5279                             &res->surf, false, &iv->view,
5281                             0, surf_state, offset);
/* Emit a buffer SURFACE_STATE for one stream-output (transform feedback)
 * output, sized so the hardware clamps writes to the bound SO buffer.
 *
 * idx indexes so_info->output[]; the surface covers the output's dword
 * offset within its SO buffer, with a stride of one vertex's dwords.
 *
 * NOTE(review): listing is elided — the `int idx` parameter, early-return,
 * braces, and the `num_elements = ...` assignment head are not all shown.
 */
5290 emit_sol_surface(struct crocus_batch *batch,
5291                  struct pipe_stream_output_info *so_info,
5294    struct crocus_context *ice = batch->ice;
/* Nothing to emit if this output doesn't exist or streamout is off
 * (presumably returns a null/zero offset on an elided line). */
5296    if (idx >= so_info->num_outputs || !ice->state.streamout_active)
5298    const struct pipe_stream_output *output = &so_info->output[idx];
5299    const int buffer = output->output_buffer;
/* Only stream 0 is supported here. */
5300    assert(output->stream == 0);
5302    struct crocus_resource *buf = (struct crocus_resource *)ice->state.so_target[buffer]->buffer;
5303    unsigned stride_dwords = so_info->stride[buffer];
/* Offset of this output's first dword within the SO buffer. */
5304    unsigned offset_dwords = ice->state.so_target[buffer]->buffer_offset / 4 + output->dst_offset;
5306    size_t size_dwords = (ice->state.so_target[buffer]->buffer_offset + ice->state.so_target[buffer]->buffer_size) / 4;
5307    unsigned num_vector_components = output->num_components;
5308    unsigned num_elements;
5309    /* FIXME: can we rely on core Mesa to ensure that the buffer isn't
5310     * too big to map using a single binding table entry?
5312    // assert((size_dwords - offset_dwords) / stride_dwords
5313    //        <= BRW_MAX_NUM_BUFFER_ENTRIES);
5315    if (size_dwords > offset_dwords + num_vector_components) {
5316       /* There is room for at least 1 transform feedback output in the buffer.
5317        * Compute the number of additional transform feedback outputs the
5318        * buffer has room for.
5321          (size_dwords - offset_dwords - num_vector_components);
5323       /* There isn't even room for a single transform feedback output in the
5324        * buffer. We can't configure the binding table entry to prevent output
5325        * entirely; we'll have to rely on the geometry shader to detect
5326        * overflow. But to minimize the damage in case of a bug, set up the
5327        * binding table entry to just allow a single output.
/* Account for the first output itself (stride is per-vertex, in dwords). */
5331    num_elements += stride_dwords;
/* Pick an R32* float format matching the output's component count. */
5333    uint32_t surface_format;
5334    switch (num_vector_components) {
5336       surface_format = ISL_FORMAT_R32_FLOAT;
5339       surface_format = ISL_FORMAT_R32G32_FLOAT;
5342       surface_format = ISL_FORMAT_R32G32B32_FLOAT;
5345       surface_format = ISL_FORMAT_R32G32B32A32_FLOAT;
5348       unreachable("Invalid vector size for transform feedback output");
5351    UNUSED struct isl_device *isl_dev = &batch->screen->isl_dev;
5352    uint32_t offset = 0;
5354    uint32_t *surf_state = stream_state(batch, isl_dev->ss.size,
5355                                        isl_dev->ss.align, &offset);
/* SO buffers are written by the GPU, hence RELOC_WRITE. */
5356    isl_buffer_fill_state(isl_dev, surf_state,
5357                          .address = crocus_state_reloc(batch, offset + isl_dev->ss.addr_offset,
5358                                                        crocus_resource_bo(&buf->base.b),
5359                                                        offset_dwords * 4, RELOC_32BIT|RELOC_WRITE),
5360                          .size_B = num_elements * 4,
5361                          .stride_B = stride_dwords * 4,
5362                          .swizzle = ISL_SWIZZLE_IDENTITY,
5363                          .format = surface_format);
/* Iterate `index` over every slot of binding-table group `group` that is
 * actually mapped to a binding table index (i.e. not CROCUS_SURFACE_NOT_USED).
 * Expects a `struct crocus_binding_table *bt` in scope at the use site. */
5368 #define foreach_surface_used(index, group) \
5369    for (int index = 0; index < bt->sizes[group]; index++) \
5370       if (crocus_group_index_to_bti(bt, group, index) != \
5371           CROCUS_SURFACE_NOT_USED)
/* Stream SURFACE_STATEs for every surface used by one shader stage and
 * record their offsets in shader->surf_offset, in binding-table order:
 * render targets (FS) / work-group buffer (CS) / SOL buffers (GS), then
 * textures, gather textures, images, UBOs and SSBOs.  Unused slots get a
 * null surface so the table stays dense.
 *
 * ff_gs selects the fixed-function GS program instead of prog[stage]; in
 * that case there is no per-stage crocus_shader_state (shs == NULL).
 *
 * NOTE(review): listing is elided — the running slot counter `s`, its
 * increments, several braces/else branches, and an extra emit_surface()
 * argument are on lines not shown here.
 */
5374 crocus_populate_binding_table(struct crocus_context *ice,
5375                               struct crocus_batch *batch,
5376                               gl_shader_stage stage, bool ff_gs)
5378    struct crocus_compiled_shader *shader = ff_gs ? ice->shaders.ff_gs_prog : ice->shaders.prog[stage];
5379    struct crocus_shader_state *shs = ff_gs ? NULL : &ice->state.shaders[stage];
5383    struct crocus_binding_table *bt = &shader->bt;
5385    uint32_t *surf_offsets = shader->surf_offset;
5388    const struct shader_info *info = crocus_get_shader_info(ice, stage);
/* --- Fragment shader: one surface per color attachment. --- */
5391    if (stage == MESA_SHADER_FRAGMENT) {
5392       struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
5393       /* Note that cso_fb->nr_cbufs == fs_key->nr_color_regions. */
5394       if (cso_fb->nr_cbufs) {
5395          for (unsigned i = 0; i < cso_fb->nr_cbufs; i++) {
5396             uint32_t write_disables = 0;
5397             bool blend_enable = false;
/* Per-RT blend state; index 0 is shared unless independent blend is on. */
5399             const struct pipe_rt_blend_state *rt =
5400                &ice->state.cso_blend->cso.rt[ice->state.cso_blend->cso.independent_blend_enable ? i : 0];
5401             struct crocus_compiled_shader *shader = ice->shaders.prog[MESA_SHADER_FRAGMENT];
5402             struct brw_wm_prog_data *wm_prog_data = (void *) shader->prog_data;
/* Build the per-channel write-disable mask (bit set = channel disabled). */
5403             write_disables |= (rt->colormask & PIPE_MASK_A) ? 0x0 : 0x8;
5404             write_disables |= (rt->colormask & PIPE_MASK_R) ? 0x0 : 0x4;
5405             write_disables |= (rt->colormask & PIPE_MASK_G) ? 0x0 : 0x2;
5406             write_disables |= (rt->colormask & PIPE_MASK_B) ? 0x0 : 0x1;
5407             /* Gen4/5 can't handle blending off when a dual src blend wm is enabled. */
5408             blend_enable = rt->blend_enable || wm_prog_data->dual_src_blend;
5410             if (cso_fb->cbufs[i]) {
5411                surf_offsets[s] = emit_surface(batch,
5412                                               (struct crocus_surface *)cso_fb->cbufs[i],
5413                                               ice->state.draw_aux_usage[i],
/* Unbound attachment: use a null FB surface (elided else branch). */
5417                emit_null_fb_surface(batch, ice, &surf_offsets[s]);
/* No color buffers at all: still need one null FB surface. */
5422          emit_null_fb_surface(batch, ice, &surf_offsets[s]);
/* Render-target-read surfaces (sampling the framebuffer). */
5426       foreach_surface_used(i, CROCUS_SURFACE_GROUP_RENDER_TARGET_READ) {
5427          struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
5428          if (cso_fb->cbufs[i]) {
5429             surf_offsets[s++] = emit_rt_surface(batch,
5430                                                 (struct crocus_surface *)cso_fb->cbufs[i],
5431                                                 ice->state.draw_aux_usage[i]);
/* --- Compute shader: work-group ID buffer. --- */
5436    if (stage == MESA_SHADER_COMPUTE) {
5437       foreach_surface_used(i, CROCUS_SURFACE_GROUP_CS_WORK_GROUPS) {
5438          surf_offsets[s] = emit_grid(ice, batch);
/* --- Geometry stage: stream-output buffers.  The SO info comes from the
 * GS if present, otherwise from the VS. --- */
5444    if (stage == MESA_SHADER_GEOMETRY) {
5445       struct pipe_stream_output_info *so_info;
5446       if (ice->shaders.uncompiled[MESA_SHADER_GEOMETRY])
5447          so_info = &ice->shaders.uncompiled[MESA_SHADER_GEOMETRY]->stream_output;
5449          so_info = &ice->shaders.uncompiled[MESA_SHADER_VERTEX]->stream_output;
5451       foreach_surface_used(i, CROCUS_SURFACE_GROUP_SOL) {
5452          surf_offsets[s] = emit_sol_surface(batch, so_info, i);
/* --- Common groups: textures, gathers, images, UBOs, SSBOs. --- */
5458    foreach_surface_used(i, CROCUS_SURFACE_GROUP_TEXTURE) {
5459       struct crocus_sampler_view *view = shs->textures[i];
5461          surf_offsets[s] = emit_sampler_view(ice, batch, false, view);
5463          emit_null_surface(batch, &surf_offsets[s]);
/* Separate gather surfaces only if the shader actually gathers. */
5468    if (info && info->uses_texture_gather) {
5469       foreach_surface_used(i, CROCUS_SURFACE_GROUP_TEXTURE_GATHER) {
5470          struct crocus_sampler_view *view = shs->textures[i];
5472             surf_offsets[s] = emit_sampler_view(ice, batch, true, view);
5474             emit_null_surface(batch, &surf_offsets[s]);
5480    foreach_surface_used(i, CROCUS_SURFACE_GROUP_IMAGE) {
5481       struct crocus_image_view *view = &shs->image[i];
5482       if (view->base.resource)
5483          surf_offsets[s] = emit_image_view(ice, batch, view);
5485          emit_null_surface(batch, &surf_offsets[s]);
5488    foreach_surface_used(i, CROCUS_SURFACE_GROUP_UBO) {
5489       if (shs->constbufs[i].buffer)
5490          surf_offsets[s] = emit_ubo_buffer(ice, batch, &shs->constbufs[i]);
5492          emit_null_surface(batch, &surf_offsets[s]);
5495    foreach_surface_used(i, CROCUS_SURFACE_GROUP_SSBO) {
5496       if (shs->ssbo[i].buffer)
5497          surf_offsets[s] = emit_ssbo_buffer(ice, batch, &shs->ssbo[i],
5498                                             !!(shs->writable_ssbos & (1 << i)));
5500          emit_null_surface(batch, &surf_offsets[s]);
5505 /* ------------------------------------------------------------------- */
/* Upload a stage's binding table (array of surface-state offsets) into the
 * batch's state buffer, 32-byte aligned, returning its offset.
 * NOTE(review): listing is elided — the `table`/`size` parameters and
 * return type are on lines not shown. */
5507 crocus_upload_binding_table(struct crocus_context *ice,
5508                             struct crocus_batch *batch,
5515    return emit_state(batch, table, size, 32);
5519 * Possibly emit STATE_BASE_ADDRESS to update Surface State Base Address.
5523 crocus_update_surface_base_address(struct crocus_batch *batch)
/* Already emitted for this batch — nothing to do (early return, elided). */
5525    if (batch->state_base_address_emitted)
5528    UNUSED uint32_t mocs = batch->screen->isl_dev.mocs.internal;
/* SBA changes require pipeline flushing before and after (see helpers). */
5530    flush_before_state_base_change(batch);
5532    crocus_emit_cmd(batch, GENX(STATE_BASE_ADDRESS), sba) {
5533       /* Set base addresses */
5534       sba.GeneralStateBaseAddressModifyEnable = true;
/* Dynamic and surface state both live in the batch's state BO. */
5537       sba.DynamicStateBaseAddressModifyEnable = true;
5538       sba.DynamicStateBaseAddress = ro_bo(batch->state.bo, 0);
5541       sba.SurfaceStateBaseAddressModifyEnable = true;
5542       sba.SurfaceStateBaseAddress = ro_bo(batch->state.bo, 0);
5544       sba.IndirectObjectBaseAddressModifyEnable = true;
/* Shader kernels are relative to the program cache BO. */
5547       sba.InstructionBaseAddressModifyEnable = true;
5548       sba.InstructionBaseAddress = ro_bo(batch->ice->shaders.cache_bo, 0); // TODO!
5551       /* Set buffer sizes on Gen8+ or upper bounds on Gen4-7 */
5553       sba.GeneralStateBufferSize = 0xfffff;
5554       sba.IndirectObjectBufferSize = 0xfffff;
5555       sba.InstructionBufferSize = 0xfffff;
5556       sba.DynamicStateBufferSize = MAX_STATE_SIZE;
5558       sba.GeneralStateBufferSizeModifyEnable = true;
5559       sba.DynamicStateBufferSizeModifyEnable = true;
5560       sba.IndirectObjectBufferSizeModifyEnable = true;
5561       sba.InstructionBuffersizeModifyEnable = true;
5563       sba.GeneralStateAccessUpperBoundModifyEnable = true;
5564       sba.IndirectObjectAccessUpperBoundModifyEnable = true;
5567       sba.InstructionAccessUpperBoundModifyEnable = true;
5571       /* Dynamic state upper bound. Although the documentation says that
5572        * programming it to zero will cause it to be ignored, that is a lie.
5573        * If this isn't programmed to a real bound, the sampler border color
5574        * pointer is rejected, causing border color to mysteriously fail.
5576       sba.DynamicStateAccessUpperBound = ro_bo(NULL, 0xfffff000);
5577       sba.DynamicStateAccessUpperBoundModifyEnable = true;
5579       /* Same idea but using General State Base Address on Gen4-5 */
5580       sba.GeneralStateAccessUpperBound = ro_bo(NULL, 0xfffff000);
5585       /* The hardware appears to pay attention to the MOCS fields even
5586        * if you don't set the "Address Modify Enable" bit for the base.
5588       sba.GeneralStateMOCS = mocs;
5589       sba.StatelessDataPortAccessMOCS = mocs;
5590       sba.DynamicStateMOCS = mocs;
5591       sba.IndirectObjectMOCS = mocs;
5592       sba.InstructionMOCS = mocs;
5593       sba.SurfaceStateMOCS = mocs;
5597    flush_after_state_base_change(batch);
5599    /* According to section 3.6.1 of VOL1 of the 965 PRM,
5600     * STATE_BASE_ADDRESS updates require a reissue of:
5602     * 3DSTATE_PIPELINE_POINTERS
5603     * 3DSTATE_BINDING_TABLE_POINTERS
5604     * MEDIA_STATE_POINTERS
5606     * and this continues through Ironlake. The Sandy Bridge PRM, vol
5607     * 1 part 1 says that the folowing packets must be reissued:
5609     * 3DSTATE_CC_POINTERS
5610     * 3DSTATE_BINDING_TABLE_POINTERS
5611     * 3DSTATE_SAMPLER_STATE_POINTERS
5612     * 3DSTATE_VIEWPORT_STATE_POINTERS
5613     * MEDIA_STATE_POINTERS
5615     * Those are always reissued following SBA updates anyway (new
5616     * batch time), except in the case of the program cache BO
5617     * changing. Having a separate state flag makes the sequence more
/* The two dirty-flag lines below are presumably per-gen alternatives under
 * elided #if/#else preprocessor conditionals — confirm against full source. */
5621    batch->ice->state.dirty |= CROCUS_DIRTY_GEN5_PIPELINED_POINTERS | CROCUS_DIRTY_GEN5_BINDING_TABLE_POINTERS;
5623    batch->ice->state.dirty |= CROCUS_DIRTY_GEN5_BINDING_TABLE_POINTERS | CROCUS_DIRTY_GEN6_SAMPLER_STATE_POINTERS;
/* Remember we've emitted SBA so re-entry takes the early-out above. */
5625    batch->state_base_address_emitted = true;
/* Compute the depth range [*zmin, *zmax] for a viewport.  With
 * window-space positions the viewport transform is bypassed (the elided
 * branch body presumably sets a fixed 0..1 range and returns — confirm);
 * otherwise defer to the Gallium helper. */
5629 crocus_viewport_zmin_zmax(const struct pipe_viewport_state *vp, bool halfz,
5630                           bool window_space_position, float *zmin, float *zmax)
5632    if (window_space_position) {
5637    util_viewport_zmin_zmax(vp, halfz, zmin, zmax);
/* Fields of the push-constant bookkeeping struct (its header and the
 * buffers[]/buffer_count members are on elided lines — see
 * setup_constant_buffers / emit_push_constant_packets below):
 * addr — GPU address of one push range;
 * max_length — largest range length seen (same units as
 * brw_ubo_range::length; presumably 32-byte registers — confirm). */
5642       struct crocus_address addr;
5646    uint32_t max_length;
/* Gather the UBO ranges the compiler decided to push (prog_data->ubo_ranges)
 * into push_bos: one address+length entry per non-empty range, tracking the
 * total and maximum lengths.  Missing buffers fall back to the workaround BO
 * so the GPU never reads an unbound address.
 *
 * NOTE(review): listing is elided — the `stage` parameter, the `n` counter
 * declaration/increment, and a `continue` for empty ranges are not shown.
 */
5651 setup_constant_buffers(struct crocus_context *ice,
5652                        struct crocus_batch *batch,
5654                        struct push_bos *push_bos)
5656    struct crocus_shader_state *shs = &ice->state.shaders[stage];
5657    struct crocus_compiled_shader *shader = ice->shaders.prog[stage];
5658    struct brw_stage_prog_data *prog_data = (void *) shader->prog_data;
5660    uint32_t push_range_sum = 0;
/* Hardware supports up to four push-constant ranges. */
5663    for (int i = 0; i < 4; i++) {
5664       const struct brw_ubo_range *range = &prog_data->ubo_ranges[i];
/* Skip unused ranges (presumably `continue` on an elided line). */
5666       if (range->length == 0)
5669       push_range_sum += range->length;
5671       if (range->length > push_bos->max_length)
5672          push_bos->max_length = range->length;
5674       /* Range block is a binding table index, map back to UBO index. */
5675       unsigned block_index = crocus_bti_to_group_index(
5676          &shader->bt, CROCUS_SURFACE_GROUP_UBO, range->block);
5677       assert(block_index != CROCUS_SURFACE_NOT_USED);
5679       struct pipe_constant_buffer *cbuf = &shs->constbufs[block_index];
5680       struct crocus_resource *res = (void *) cbuf->buffer;
/* Push ranges must start 32-byte aligned (range->start is in 32B units). */
5682       assert(cbuf->buffer_offset % 32 == 0);
5684       push_bos->buffers[n].length = range->length;
5685       push_bos->buffers[n].addr =
5686          res ? ro_bo(res->bo, range->start * 32 + cbuf->buffer_offset)
5687          : ro_bo(batch->ice->workaround_bo,
5688                  batch->ice->workaround_offset);
5692    /* From the 3DSTATE_CONSTANT_XS and 3DSTATE_CONSTANT_ALL programming notes:
5694     *    "The sum of all four read length fields must be less than or
5695     *    equal to the size of 64."
5697    assert(push_range_sum <= 64);
5699    push_bos->buffer_count = n;
/* Ivybridge workaround: a depth-stalling pipe-control write to the scratch
 * workaround BO, required before certain VS-related state changes (see the
 * callers around 3DSTATE_CONSTANT_VS and URB programming). */
5704 gen7_emit_vs_workaround_flush(struct crocus_batch *batch)
5706    crocus_emit_pipe_control_write(batch,
5708                                   PIPE_CONTROL_WRITE_IMMEDIATE
5709                                   | PIPE_CONTROL_DEPTH_STALL,
5710                                   batch->ice->workaround_bo,
5711                                   batch->ice->workaround_offset, 0);
/* Emit the 3DSTATE_CONSTANT_* packet for one stage from the ranges that
 * setup_constant_buffers() collected in push_bos.
 *
 * NOTE(review): listing is elided — the `stage` parameter and several
 * per-generation #if branches/braces are not shown; the two "int n" blocks
 * below are presumably alternatives for different gens — confirm.
 */
5716 emit_push_constant_packets(struct crocus_context *ice,
5717                            struct crocus_batch *batch,
5719                            const struct push_bos *push_bos)
5721    struct crocus_compiled_shader *shader = ice->shaders.prog[stage];
5722    struct brw_stage_prog_data *prog_data = shader ? (void *) shader->prog_data : NULL;
5723    UNUSED uint32_t mocs = crocus_mocs(NULL, &batch->screen->isl_dev);
5726    if (stage == MESA_SHADER_VERTEX) {
/* Ivybridge requires a depth-stall flush before VS constant updates. */
5727       if (batch->screen->devinfo.platform == INTEL_PLATFORM_IVB)
5728          gen7_emit_vs_workaround_flush(batch);
/* The sub-opcode selects which stage's CONSTANT packet this really is. */
5731    crocus_emit_cmd(batch, GENX(3DSTATE_CONSTANT_VS), pkt) {
5732       pkt._3DCommandSubOpcode = push_constant_opcodes[stage];
5735       /* MOCS is MBZ on Gen8 so we skip it there */
5736       pkt.ConstantBody.MOCS = mocs;
5740       /* The Skylake PRM contains the following restriction:
5742        *    "The driver must ensure The following case does not occur
5743        *     without a flush to the 3D engine: 3DSTATE_CONSTANT_* with
5744        *     buffer 3 read length equal to zero committed followed by a
5745        *     3DSTATE_CONSTANT_* with buffer 0 read length not equal to
5748        * To avoid this, we program the buffers in the highest slots.
5749        * This way, slot 0 is only used if slot 3 is also used.
5751       int n = push_bos->buffer_count;
5753 #if GFX_VERx10 >= 75
5754       const unsigned shift = 4 - n;
5756       const unsigned shift = 0;
5758       for (int i = 0; i < n; i++) {
5759          pkt.ConstantBody.ReadLength[i + shift] =
5760             push_bos->buffers[i].length;
5761          pkt.ConstantBody.Buffer[i + shift] = push_bos->buffers[i].addr;
/* Older-gen path: single buffer slot with a (length - 1) encoding. */
5766       int n = push_bos->buffer_count;
5769          pkt.Buffer0Valid = true;
5770          pkt.ConstantBody.PointertoConstantBuffer0 = push_bos->buffers[0].addr.offset;
5771          pkt.ConstantBody.ConstantBuffer0ReadLength = push_bos->buffers[0].length - 1;
/* Per-generation container for depth/stencil fields — the three typedefs
 * are presumably mutually exclusive alternatives under elided #if GFX_VER
 * conditionals (confirm): WM_DEPTH_STENCIL packet, DEPTH_STENCIL_STATE, or
 * COLOR_CALC_STATE on the oldest gens.  set_depth_stencil_bits() below
 * fills whichever one this build uses. */
5781 typedef struct GENX(3DSTATE_WM_DEPTH_STENCIL) DEPTH_STENCIL_GENXML;
5783 typedef struct GENX(DEPTH_STENCIL_STATE) DEPTH_STENCIL_GENXML;
5785 typedef struct GENX(COLOR_CALC_STATE) DEPTH_STENCIL_GENXML;
/* Translate the bound pipe depth/stencil/alpha CSO into the generation's
 * depth-stencil container (DEPTH_STENCIL_GENXML, see typedefs above).
 * stencil[0] is front-face state, stencil[1] back-face state. */
5789 set_depth_stencil_bits(struct crocus_context *ice, DEPTH_STENCIL_GENXML *ds)
5791    struct crocus_depth_stencil_alpha_state *cso = ice->state.cso_zsa;
5792    ds->DepthTestEnable = cso->cso.depth_enabled;
5793    ds->DepthBufferWriteEnable = cso->cso.depth_writemask;
5794    ds->DepthTestFunction = translate_compare_func(cso->cso.depth_func);
/* Front-face stencil ops/masks. */
5796    ds->StencilFailOp = cso->cso.stencil[0].fail_op;
5797    ds->StencilPassDepthFailOp = cso->cso.stencil[0].zfail_op;
5798    ds->StencilPassDepthPassOp = cso->cso.stencil[0].zpass_op;
5799    ds->StencilTestFunction = translate_compare_func(cso->cso.stencil[0].func);
5801    ds->StencilTestMask = cso->cso.stencil[0].valuemask;
5802    ds->StencilWriteMask = cso->cso.stencil[0].writemask;
/* Back-face stencil ops/masks. */
5804    ds->BackfaceStencilFailOp = cso->cso.stencil[1].fail_op;
5805    ds->BackfaceStencilPassDepthFailOp = cso->cso.stencil[1].zfail_op;
5806    ds->BackfaceStencilPassDepthPassOp = cso->cso.stencil[1].zpass_op;
5807    ds->BackfaceStencilTestFunction = translate_compare_func(cso->cso.stencil[1].func);
5809    ds->BackfaceStencilTestMask = cso->cso.stencil[1].valuemask;
5810    ds->BackfaceStencilWriteMask = cso->cso.stencil[1].writemask;
5811    ds->DoubleSidedStencilEnable = cso->cso.stencil[1].enabled;
5812    ds->StencilTestEnable = cso->cso.stencil[0].enabled;
/* Writes happen if either face can modify the stencil buffer. */
5813    ds->StencilBufferWriteEnable =
5814       cso->cso.stencil[0].writemask != 0 ||
5815       (cso->cso.stencil[1].enabled && cso->cso.stencil[1].writemask != 0);
/* Pack one VERTEX_BUFFER_STATE entry into *map for the given BO range.
 *
 * NOTE(review): listing is elided — the `buffer_id`, `stride`, `step_rate`
 * and `map` parameters (all used below) and per-gen #if branches around
 * BufferSize vs EndAddress are on lines not shown; confirm against source.
 */
5819 emit_vertex_buffer_state(struct crocus_batch *batch,
5821                          struct crocus_bo *bo,
5822                          unsigned start_offset,
5823                          unsigned end_offset,
5828    const unsigned vb_dwords = GENX(VERTEX_BUFFER_STATE_length);
5829    _crocus_pack_state(batch, GENX(VERTEX_BUFFER_STATE), *map, vb) {
5830       vb.BufferStartingAddress = ro_bo(bo, start_offset);
/* Newer gens take a size... */
5832       vb.BufferSize = end_offset - start_offset;
5834       vb.VertexBufferIndex = buffer_id;
5835       vb.BufferPitch = stride;
5837       vb.AddressModifyEnable = true;
5840       vb.MOCS = crocus_mocs(bo, &batch->screen->isl_dev);
/* Non-zero step rate means per-instance data. */
5843       vb.BufferAccessType = step_rate ? INSTANCEDATA : VERTEXDATA;
5844       vb.InstanceDataStepRate = step_rate;
/* ...older gens take an inclusive end address instead. */
5846       vb.EndAddress = ro_bo(bo, end_offset - 1);
/* Compute the effective sample mask: the user-set sample_mask restricted to
 * the framebuffer's actual sample count.  For single-sampled framebuffers
 * the elided early-return presumably yields a full mask — confirm. */
5855 determine_sample_mask(struct crocus_context *ice)
5857    uint32_t num_samples = ice->state.framebuffer.samples;
5859    if (num_samples <= 1)
/* Keep only bits for samples that exist. */
5862    uint32_t fb_mask = (1 << num_samples) - 1;
5863    return ice->state.sample_mask & fb_mask;
5868 crocus_upload_dirty_render_state(struct crocus_context *ice,
5869 struct crocus_batch *batch,
5870 const struct pipe_draw_info *draw)
5872 uint64_t dirty = ice->state.dirty;
5873 uint64_t stage_dirty = ice->state.stage_dirty;
5875 if (!(dirty & CROCUS_ALL_DIRTY_FOR_RENDER) &&
5876 !(stage_dirty & CROCUS_ALL_STAGE_DIRTY_FOR_RENDER))
5879 if (dirty & CROCUS_DIRTY_VF_STATISTICS) {
5880 crocus_emit_cmd(batch, GENX(3DSTATE_VF_STATISTICS), vf) {
5881 vf.StatisticsEnable = true;
5886 if (stage_dirty & (CROCUS_STAGE_DIRTY_CONSTANTS_VS |
5887 CROCUS_STAGE_DIRTY_CONSTANTS_FS)) {
5888 bool ret = calculate_curbe_offsets(batch);
5890 dirty |= CROCUS_DIRTY_GEN4_CURBE | CROCUS_DIRTY_WM | CROCUS_DIRTY_CLIP;
5891 stage_dirty |= CROCUS_STAGE_DIRTY_VS;
5895 if (dirty & (CROCUS_DIRTY_GEN4_CURBE | CROCUS_DIRTY_RASTER) ||
5896 stage_dirty & CROCUS_STAGE_DIRTY_VS) {
5897 bool ret = crocus_calculate_urb_fence(batch, ice->curbe.total_size,
5898 brw_vue_prog_data(ice->shaders.prog[MESA_SHADER_VERTEX]->prog_data)->urb_entry_size,
5899 ((struct brw_sf_prog_data *)ice->shaders.sf_prog->prog_data)->urb_entry_size);
5901 dirty |= CROCUS_DIRTY_GEN5_PIPELINED_POINTERS | CROCUS_DIRTY_RASTER | CROCUS_DIRTY_CLIP;
5902 stage_dirty |= CROCUS_STAGE_DIRTY_GS | CROCUS_STAGE_DIRTY_VS;
5906 if (dirty & CROCUS_DIRTY_CC_VIEWPORT) {
5907 const struct crocus_rasterizer_state *cso_rast = ice->state.cso_rast;
5908 uint32_t cc_vp_address;
5910 /* XXX: could avoid streaming for depth_clip [0,1] case. */
5911 uint32_t *cc_vp_map =
5913 4 * ice->state.num_viewports *
5914 GENX(CC_VIEWPORT_length), 32, &cc_vp_address);
5915 for (int i = 0; i < ice->state.num_viewports; i++) {
5917 crocus_viewport_zmin_zmax(&ice->state.viewports[i], cso_rast->cso.clip_halfz,
5918 ice->state.window_space_position,
5920 if (cso_rast->cso.depth_clip_near)
5922 if (cso_rast->cso.depth_clip_far)
5925 crocus_pack_state(GENX(CC_VIEWPORT), cc_vp_map, ccv) {
5926 ccv.MinimumDepth = zmin;
5927 ccv.MaximumDepth = zmax;
5930 cc_vp_map += GENX(CC_VIEWPORT_length);
5934 crocus_emit_cmd(batch, GENX(3DSTATE_VIEWPORT_STATE_POINTERS_CC), ptr) {
5935 ptr.CCViewportPointer = cc_vp_address;
5938 crocus_emit_cmd(batch, GENX(3DSTATE_VIEWPORT_STATE_POINTERS), vp) {
5939 vp.CCViewportStateChange = 1;
5940 vp.PointertoCC_VIEWPORT = cc_vp_address;
5943 ice->state.cc_vp_address = cc_vp_address;
5944 dirty |= CROCUS_DIRTY_COLOR_CALC_STATE;
5948 if (dirty & CROCUS_DIRTY_SF_CL_VIEWPORT) {
5949 struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
5951 uint32_t sf_cl_vp_address;
5954 4 * ice->state.num_viewports *
5955 GENX(SF_CLIP_VIEWPORT_length), 64, &sf_cl_vp_address);
5959 4 * ice->state.num_viewports * GENX(SF_VIEWPORT_length),
5960 32, &ice->state.sf_vp_address);
5961 uint32_t *clip_map =
5963 4 * ice->state.num_viewports * GENX(CLIP_VIEWPORT_length),
5964 32, &ice->state.clip_vp_address);
5967 for (unsigned i = 0; i < ice->state.num_viewports; i++) {
5968 const struct pipe_viewport_state *state = &ice->state.viewports[i];
5969 float gb_xmin, gb_xmax, gb_ymin, gb_ymax;
5972 float vp_xmin = viewport_extent(state, 0, -1.0f);
5973 float vp_xmax = viewport_extent(state, 0, 1.0f);
5974 float vp_ymin = viewport_extent(state, 1, -1.0f);
5975 float vp_ymax = viewport_extent(state, 1, 1.0f);
5977 intel_calculate_guardband_size(0, cso_fb->width, 0, cso_fb->height,
5978 state->scale[0], state->scale[1],
5979 state->translate[0], state->translate[1],
5980 &gb_xmin, &gb_xmax, &gb_ymin, &gb_ymax);
5982 crocus_pack_state(GENX(SF_CLIP_VIEWPORT), vp_map, vp)
5984 crocus_pack_state(GENX(SF_VIEWPORT), vp_map, vp)
5987 vp.ViewportMatrixElementm00 = state->scale[0];
5988 vp.ViewportMatrixElementm11 = state->scale[1];
5989 vp.ViewportMatrixElementm22 = state->scale[2];
5990 vp.ViewportMatrixElementm30 = state->translate[0];
5991 vp.ViewportMatrixElementm31 = state->translate[1];
5992 vp.ViewportMatrixElementm32 = state->translate[2];
5994 struct pipe_scissor_state scissor;
5995 crocus_fill_scissor_rect(ice, 0, &scissor);
5996 vp.ScissorRectangle.ScissorRectangleXMin = scissor.minx;
5997 vp.ScissorRectangle.ScissorRectangleXMax = scissor.maxx;
5998 vp.ScissorRectangle.ScissorRectangleYMin = scissor.miny;
5999 vp.ScissorRectangle.ScissorRectangleYMax = scissor.maxy;
6003 vp.XMinClipGuardband = gb_xmin;
6004 vp.XMaxClipGuardband = gb_xmax;
6005 vp.YMinClipGuardband = gb_ymin;
6006 vp.YMaxClipGuardband = gb_ymax;
6009 vp.XMinViewPort = MAX2(vp_xmin, 0);
6010 vp.XMaxViewPort = MIN2(vp_xmax, cso_fb->width) - 1;
6011 vp.YMinViewPort = MAX2(vp_ymin, 0);
6012 vp.YMaxViewPort = MIN2(vp_ymax, cso_fb->height) - 1;
6016 crocus_pack_state(GENX(CLIP_VIEWPORT), clip_map, clip) {
6017 clip.XMinClipGuardband = gb_xmin;
6018 clip.XMaxClipGuardband = gb_xmax;
6019 clip.YMinClipGuardband = gb_ymin;
6020 clip.YMaxClipGuardband = gb_ymax;
6024 vp_map += GENX(SF_CLIP_VIEWPORT_length);
6026 vp_map += GENX(SF_VIEWPORT_length);
6027 clip_map += GENX(CLIP_VIEWPORT_length);
6031 crocus_emit_cmd(batch, GENX(3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP), ptr) {
6032 ptr.SFClipViewportPointer = sf_cl_vp_address;
6035 crocus_emit_cmd(batch, GENX(3DSTATE_VIEWPORT_STATE_POINTERS), vp) {
6036 vp.SFViewportStateChange = 1;
6037 vp.CLIPViewportStateChange = 1;
6038 vp.PointertoCLIP_VIEWPORT = ice->state.clip_vp_address;
6039 vp.PointertoSF_VIEWPORT = ice->state.sf_vp_address;
6045 if (dirty & CROCUS_DIRTY_GEN6_URB) {
6047 bool gs_present = ice->shaders.prog[MESA_SHADER_GEOMETRY] != NULL
6048 || ice->shaders.ff_gs_prog;
6050 struct brw_vue_prog_data *vue_prog_data =
6051 (void *) ice->shaders.prog[MESA_SHADER_VERTEX]->prog_data;
6052 const unsigned vs_size = vue_prog_data->urb_entry_size;
6053 unsigned gs_size = vs_size;
6054 if (ice->shaders.prog[MESA_SHADER_GEOMETRY]) {
6055 struct brw_vue_prog_data *gs_vue_prog_data =
6056 (void *) ice->shaders.prog[MESA_SHADER_GEOMETRY]->prog_data;
6057 gs_size = gs_vue_prog_data->urb_entry_size;
6060 genX(crocus_upload_urb)(batch, vs_size, gs_present, gs_size);
6063 const struct intel_device_info *devinfo = &batch->screen->devinfo;
6064 bool gs_present = ice->shaders.prog[MESA_SHADER_GEOMETRY] != NULL;
6065 bool tess_present = ice->shaders.prog[MESA_SHADER_TESS_EVAL] != NULL;
6066 unsigned entry_size[4];
6068 for (int i = MESA_SHADER_VERTEX; i <= MESA_SHADER_GEOMETRY; i++) {
6069 if (!ice->shaders.prog[i]) {
6072 struct brw_vue_prog_data *vue_prog_data =
6073 (void *) ice->shaders.prog[i]->prog_data;
6074 entry_size[i] = vue_prog_data->urb_entry_size;
6076 assert(entry_size[i] != 0);
6079 /* If we're just switching between programs with the same URB requirements,
6080 * skip the rest of the logic.
6082 bool no_change = false;
6083 if (ice->urb.vsize == entry_size[MESA_SHADER_VERTEX] &&
6084 ice->urb.gs_present == gs_present &&
6085 ice->urb.gsize == entry_size[MESA_SHADER_GEOMETRY] &&
6086 ice->urb.tess_present == tess_present &&
6087 ice->urb.hsize == entry_size[MESA_SHADER_TESS_CTRL] &&
6088 ice->urb.dsize == entry_size[MESA_SHADER_TESS_EVAL]) {
6093 ice->urb.vsize = entry_size[MESA_SHADER_VERTEX];
6094 ice->urb.gs_present = gs_present;
6095 ice->urb.gsize = entry_size[MESA_SHADER_GEOMETRY];
6096 ice->urb.tess_present = tess_present;
6097 ice->urb.hsize = entry_size[MESA_SHADER_TESS_CTRL];
6098 ice->urb.dsize = entry_size[MESA_SHADER_TESS_EVAL];
6100 unsigned entries[4];
6103 intel_get_urb_config(devinfo,
6104 batch->screen->l3_config_3d,
6108 entries, start, NULL, &constrained);
6111 if (devinfo->platform == INTEL_PLATFORM_IVB)
6112 gen7_emit_vs_workaround_flush(batch);
6114 for (int i = MESA_SHADER_VERTEX; i <= MESA_SHADER_GEOMETRY; i++) {
6115 crocus_emit_cmd(batch, GENX(3DSTATE_URB_VS), urb) {
6116 urb._3DCommandSubOpcode += i;
6117 urb.VSURBStartingAddress = start[i];
6118 urb.VSURBEntryAllocationSize = entry_size[i] - 1;
6119 urb.VSNumberofURBEntries = entries[i];
6126 if (dirty & CROCUS_DIRTY_GEN6_BLEND_STATE) {
6127 struct crocus_blend_state *cso_blend = ice->state.cso_blend;
6128 struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
6129 struct crocus_depth_stencil_alpha_state *cso_zsa = ice->state.cso_zsa;
6131 STATIC_ASSERT(GENX(BLEND_STATE_ENTRY_length) == 2);
6133 MAX2(cso_fb->nr_cbufs, 1) * GENX(BLEND_STATE_ENTRY_length);
6135 rt_dwords += GENX(BLEND_STATE_length);
6137 uint32_t blend_offset;
6138 uint32_t *blend_map =
6140 4 * rt_dwords, 64, &blend_offset);
6143 struct GENX(BLEND_STATE) be = { 0 };
6146 for (int i = 0; i < BRW_MAX_DRAW_BUFFERS; i++) {
6147 struct GENX(BLEND_STATE_ENTRY) entry = { 0 };
6151 be.AlphaTestEnable = cso_zsa->cso.alpha_enabled;
6152 be.AlphaTestFunction = translate_compare_func(cso_zsa->cso.alpha_func);
6153 be.AlphaToCoverageEnable = cso_blend->cso.alpha_to_coverage;
6154 be.AlphaToOneEnable = cso_blend->cso.alpha_to_one;
6155 be.AlphaToCoverageDitherEnable = GFX_VER >= 7 && cso_blend->cso.alpha_to_coverage_dither;
6156 be.ColorDitherEnable = cso_blend->cso.dither;
6159 for (int i = 0; i < BRW_MAX_DRAW_BUFFERS; i++) {
6160 struct GENX(BLEND_STATE_ENTRY) entry = { 0 };
6164 const struct pipe_rt_blend_state *rt =
6165 &cso_blend->cso.rt[cso_blend->cso.independent_blend_enable ? i : 0];
6167 be.IndependentAlphaBlendEnable = set_blend_entry_bits(batch, &entry, cso_blend, i) ||
6168 be.IndependentAlphaBlendEnable;
6170 if (GFX_VER >= 8 || can_emit_logic_op(ice)) {
6171 entry.LogicOpEnable = cso_blend->cso.logicop_enable;
6172 entry.LogicOpFunction = cso_blend->cso.logicop_func;
6175 entry.ColorClampRange = COLORCLAMP_RTFORMAT;
6176 entry.PreBlendColorClampEnable = true;
6177 entry.PostBlendColorClampEnable = true;
6179 entry.WriteDisableRed = !(rt->colormask & PIPE_MASK_R);
6180 entry.WriteDisableGreen = !(rt->colormask & PIPE_MASK_G);
6181 entry.WriteDisableBlue = !(rt->colormask & PIPE_MASK_B);
6182 entry.WriteDisableAlpha = !(rt->colormask & PIPE_MASK_A);
6185 GENX(BLEND_STATE_ENTRY_pack)(NULL, &blend_map[1 + i * 2], &entry);
6187 GENX(BLEND_STATE_ENTRY_pack)(NULL, &blend_map[i * 2], &entry);
6192 GENX(BLEND_STATE_pack)(NULL, blend_map, &be);
6195 crocus_emit_cmd(batch, GENX(3DSTATE_CC_STATE_POINTERS), ptr) {
6196 ptr.PointertoBLEND_STATE = blend_offset;
6197 ptr.BLEND_STATEChange = true;
6200 crocus_emit_cmd(batch, GENX(3DSTATE_BLEND_STATE_POINTERS), ptr) {
6201 ptr.BlendStatePointer = blend_offset;
6203 ptr.BlendStatePointerValid = true;
6210 if (dirty & CROCUS_DIRTY_COLOR_CALC_STATE) {
6211 struct crocus_depth_stencil_alpha_state *cso = ice->state.cso_zsa;
6212 UNUSED struct crocus_blend_state *cso_blend = ice->state.cso_blend;
6213 struct pipe_stencil_ref *p_stencil_refs = &ice->state.stencil_ref;
6217 sizeof(uint32_t) * GENX(COLOR_CALC_STATE_length),
6220 dirty |= CROCUS_DIRTY_GEN5_PIPELINED_POINTERS;
6222 _crocus_pack_state(batch, GENX(COLOR_CALC_STATE), cc_map, cc) {
6223 cc.AlphaTestFormat = ALPHATEST_FLOAT32;
6224 cc.AlphaReferenceValueAsFLOAT32 = cso->cso.alpha_ref_value;
6228 set_depth_stencil_bits(ice, &cc);
6230 if (cso_blend->cso.logicop_enable) {
6231 if (can_emit_logic_op(ice)) {
6232 cc.LogicOpEnable = cso_blend->cso.logicop_enable;
6233 cc.LogicOpFunction = cso_blend->cso.logicop_func;
6236 cc.ColorDitherEnable = cso_blend->cso.dither;
6238 cc.IndependentAlphaBlendEnable = set_blend_entry_bits(batch, &cc, cso_blend, 0);
6240 if (cso->cso.alpha_enabled && ice->state.framebuffer.nr_cbufs <= 1) {
6241 cc.AlphaTestEnable = cso->cso.alpha_enabled;
6242 cc.AlphaTestFunction = translate_compare_func(cso->cso.alpha_func);
6244 cc.StatisticsEnable = ice->state.stats_wm ? 1 : 0;
6245 cc.CCViewportStatePointer = ro_bo(batch->state.bo, ice->state.cc_vp_address);
6247 cc.AlphaTestFormat = ALPHATEST_FLOAT32;
6248 cc.AlphaReferenceValueAsFLOAT32 = cso->cso.alpha_ref_value;
6250 cc.BlendConstantColorRed = ice->state.blend_color.color[0];
6251 cc.BlendConstantColorGreen = ice->state.blend_color.color[1];
6252 cc.BlendConstantColorBlue = ice->state.blend_color.color[2];
6253 cc.BlendConstantColorAlpha = ice->state.blend_color.color[3];
6255 cc.StencilReferenceValue = p_stencil_refs->ref_value[0];
6256 cc.BackfaceStencilReferenceValue = p_stencil_refs->ref_value[1];
6258 ice->shaders.cc_offset = cc_offset;
6260 crocus_emit_cmd(batch, GENX(3DSTATE_CC_STATE_POINTERS), ptr) {
6261 ptr.ColorCalcStatePointer = cc_offset;
6263 ptr.ColorCalcStatePointerValid = true;
6269 if (dirty & CROCUS_DIRTY_GEN4_CONSTANT_COLOR) {
6270 crocus_emit_cmd(batch, GENX(3DSTATE_CONSTANT_COLOR), blend_cc) {
6271 blend_cc.BlendConstantColorRed = ice->state.blend_color.color[0];
6272 blend_cc.BlendConstantColorGreen = ice->state.blend_color.color[1];
6273 blend_cc.BlendConstantColorBlue = ice->state.blend_color.color[2];
6274 blend_cc.BlendConstantColorAlpha = ice->state.blend_color.color[3];
6278 for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) {
6279 if (!(stage_dirty & (CROCUS_STAGE_DIRTY_CONSTANTS_VS << stage)))
6282 struct crocus_shader_state *shs = &ice->state.shaders[stage];
6283 struct crocus_compiled_shader *shader = ice->shaders.prog[stage];
6288 if (shs->sysvals_need_upload)
6289 upload_sysvals(ice, stage);
6292 dirty |= CROCUS_DIRTY_GEN4_CURBE;
6295 struct push_bos push_bos = {};
6296 setup_constant_buffers(ice, batch, stage, &push_bos);
6298 emit_push_constant_packets(ice, batch, stage, &push_bos);
6302 for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) {
6303 if (stage_dirty & (CROCUS_STAGE_DIRTY_BINDINGS_VS << stage)) {
6304 if (ice->shaders.prog[stage]) {
6306 dirty |= CROCUS_DIRTY_GEN5_BINDING_TABLE_POINTERS;
6308 crocus_populate_binding_table(ice, batch, stage, false);
6309 ice->shaders.prog[stage]->bind_bo_offset =
6310 crocus_upload_binding_table(ice, batch,
6311 ice->shaders.prog[stage]->surf_offset,
6312 ice->shaders.prog[stage]->bt.size_bytes);
6315 crocus_emit_cmd(batch, GENX(3DSTATE_BINDING_TABLE_POINTERS_VS), ptr) {
6316 ptr._3DCommandSubOpcode = 38 + stage;
6317 ptr.PointertoVSBindingTable = ice->shaders.prog[stage]->bind_bo_offset;
6321 } else if (stage == MESA_SHADER_GEOMETRY && ice->shaders.ff_gs_prog) {
6322 dirty |= CROCUS_DIRTY_GEN5_BINDING_TABLE_POINTERS;
6323 crocus_populate_binding_table(ice, batch, stage, true);
6324 ice->shaders.ff_gs_prog->bind_bo_offset =
6325 crocus_upload_binding_table(ice, batch,
6326 ice->shaders.ff_gs_prog->surf_offset,
6327 ice->shaders.ff_gs_prog->bt.size_bytes);
6333 if (dirty & CROCUS_DIRTY_GEN5_BINDING_TABLE_POINTERS) {
6334 struct crocus_compiled_shader *gs = ice->shaders.prog[MESA_SHADER_GEOMETRY];
6336 gs = ice->shaders.ff_gs_prog;
6337 crocus_emit_cmd(batch, GENX(3DSTATE_BINDING_TABLE_POINTERS), ptr) {
6338 ptr.PointertoVSBindingTable = ice->shaders.prog[MESA_SHADER_VERTEX]->bind_bo_offset;
6339 ptr.PointertoPSBindingTable = ice->shaders.prog[MESA_SHADER_FRAGMENT]->bind_bo_offset;
6341 ptr.VSBindingTableChange = true;
6342 ptr.PSBindingTableChange = true;
6343 ptr.GSBindingTableChange = gs ? true : false;
6344 ptr.PointertoGSBindingTable = gs ? gs->bind_bo_offset : 0;
6350 bool sampler_updates = dirty & CROCUS_DIRTY_GEN6_SAMPLER_STATE_POINTERS;
6351 for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) {
6352 if (!(stage_dirty & (CROCUS_STAGE_DIRTY_SAMPLER_STATES_VS << stage)) ||
6353 !ice->shaders.prog[stage])
6356 crocus_upload_sampler_states(ice, batch, stage);
6358 sampler_updates = true;
6361 struct crocus_shader_state *shs = &ice->state.shaders[stage];
6363 crocus_emit_cmd(batch, GENX(3DSTATE_SAMPLER_STATE_POINTERS_VS), ptr) {
6364 ptr._3DCommandSubOpcode = 43 + stage;
6365 ptr.PointertoVSSamplerState = shs->sampler_offset;
6370 if (sampler_updates) {
6372 struct crocus_shader_state *shs_vs = &ice->state.shaders[MESA_SHADER_VERTEX];
6373 struct crocus_shader_state *shs_gs = &ice->state.shaders[MESA_SHADER_GEOMETRY];
6374 struct crocus_shader_state *shs_fs = &ice->state.shaders[MESA_SHADER_FRAGMENT];
6375 crocus_emit_cmd(batch, GENX(3DSTATE_SAMPLER_STATE_POINTERS), ptr) {
6376 if (ice->shaders.prog[MESA_SHADER_VERTEX] &&
6377 (dirty & CROCUS_DIRTY_GEN6_SAMPLER_STATE_POINTERS ||
6378 stage_dirty & (CROCUS_STAGE_DIRTY_SAMPLER_STATES_VS << MESA_SHADER_VERTEX))) {
6379 ptr.VSSamplerStateChange = true;
6380 ptr.PointertoVSSamplerState = shs_vs->sampler_offset;
6382 if (ice->shaders.prog[MESA_SHADER_GEOMETRY] &&
6383 (dirty & CROCUS_DIRTY_GEN6_SAMPLER_STATE_POINTERS ||
6384 stage_dirty & (CROCUS_STAGE_DIRTY_SAMPLER_STATES_VS << MESA_SHADER_GEOMETRY))) {
6385 ptr.GSSamplerStateChange = true;
6386 ptr.PointertoGSSamplerState = shs_gs->sampler_offset;
6388 if (ice->shaders.prog[MESA_SHADER_FRAGMENT] &&
6389 (dirty & CROCUS_DIRTY_GEN6_SAMPLER_STATE_POINTERS ||
6390 stage_dirty & (CROCUS_STAGE_DIRTY_SAMPLER_STATES_VS << MESA_SHADER_FRAGMENT))) {
6391 ptr.PSSamplerStateChange = true;
6392 ptr.PointertoPSSamplerState = shs_fs->sampler_offset;
6399 if (dirty & CROCUS_DIRTY_GEN6_MULTISAMPLE) {
6400 crocus_emit_cmd(batch, GENX(3DSTATE_MULTISAMPLE), ms) {
6402 ice->state.cso_rast->cso.half_pixel_center ? CENTER : UL_CORNER;
6403 if (ice->state.framebuffer.samples > 0)
6404 ms.NumberofMultisamples = ffs(ice->state.framebuffer.samples) - 1;
6406 INTEL_SAMPLE_POS_4X(ms.Sample);
6408 switch (ice->state.framebuffer.samples) {
6410 INTEL_SAMPLE_POS_1X(ms.Sample);
6413 INTEL_SAMPLE_POS_2X(ms.Sample);
6416 INTEL_SAMPLE_POS_4X(ms.Sample);
6419 INTEL_SAMPLE_POS_8X(ms.Sample);
6428 if (dirty & CROCUS_DIRTY_GEN6_SAMPLE_MASK) {
6429 crocus_emit_cmd(batch, GENX(3DSTATE_SAMPLE_MASK), ms) {
6430 ms.SampleMask = determine_sample_mask(ice);
6436 struct crocus_compiled_shader *shader = ice->shaders.prog[MESA_SHADER_FRAGMENT];
6437 if ((stage_dirty & CROCUS_STAGE_DIRTY_FS) && shader) {
6438 struct brw_stage_prog_data *prog_data = shader->prog_data;
6439 struct brw_wm_prog_data *wm_prog_data = (void *) shader->prog_data;
6441 crocus_emit_cmd(batch, GENX(3DSTATE_PS), ps) {
6443 /* Initialize the execution mask with VMask. Otherwise, derivatives are
6444 * incorrect for subspans where some of the pixels are unlit. We believe
6445 * the bit just didn't take effect in previous generations.
6447 ps.VectorMaskEnable = GFX_VER >= 8 && wm_prog_data->uses_vmask;
6449 brw_fs_get_dispatch_enables(&batch->screen->devinfo, wm_prog_data,
6450 ice->state.framebuffer.samples,
6451 &ps._8PixelDispatchEnable,
6452 &ps._16PixelDispatchEnable,
6453 &ps._32PixelDispatchEnable);
6455 ps.DispatchGRFStartRegisterForConstantSetupData0 =
6456 brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 0);
6457 ps.DispatchGRFStartRegisterForConstantSetupData1 =
6458 brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 1);
6459 ps.DispatchGRFStartRegisterForConstantSetupData2 =
6460 brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 2);
6462 ps.KernelStartPointer0 = KSP(ice, shader) +
6463 brw_wm_prog_data_prog_offset(wm_prog_data, ps, 0);
6464 ps.KernelStartPointer1 = KSP(ice, shader) +
6465 brw_wm_prog_data_prog_offset(wm_prog_data, ps, 1);
6466 ps.KernelStartPointer2 = KSP(ice, shader) +
6467 brw_wm_prog_data_prog_offset(wm_prog_data, ps, 2);
6469 #if GFX_VERx10 == 75
6470 ps.SampleMask = determine_sample_mask(ice);
6472 // XXX: WABTPPrefetchDisable, see above, drop at C0
6473 ps.BindingTableEntryCount = shader->bt.size_bytes / 4;
6474 ps.FloatingPointMode = prog_data->use_alt_mode;
6476 ps.MaximumNumberofThreadsPerPSD =
6477 batch->screen->devinfo.max_threads_per_psd - 2;
6479 ps.MaximumNumberofThreads = batch->screen->devinfo.max_wm_threads - 1;
6482 ps.PushConstantEnable = prog_data->ubo_ranges[0].length > 0;
6485 ps.oMaskPresenttoRenderTarget = wm_prog_data->uses_omask;
6486 ps.DualSourceBlendEnable = wm_prog_data->dual_src_blend && ice->state.cso_blend->dual_color_blending;
6487 ps.AttributeEnable = (wm_prog_data->num_varying_inputs != 0);
6489 /* From the documentation for this packet:
6490 * "If the PS kernel does not need the Position XY Offsets to
6491 * compute a Position Value, then this field should be programmed
6492 * to POSOFFSET_NONE."
6494 * "SW Recommendation: If the PS kernel needs the Position Offsets
6495 * to compute a Position XY value, this field should match Position
6496 * ZW Interpolation Mode to ensure a consistent position.xyzw
6499 * We only require XY sample offsets. So, this recommendation doesn't
6500 * look useful at the moment. We might need this in future.
6502 ps.PositionXYOffsetSelect =
6503 wm_prog_data->uses_pos_offset ? POSOFFSET_SAMPLE : POSOFFSET_NONE;
6505 if (wm_prog_data->base.total_scratch) {
6506 struct crocus_bo *bo = crocus_get_scratch_space(ice, wm_prog_data->base.total_scratch, MESA_SHADER_FRAGMENT);
6507 ps.PerThreadScratchSpace = ffs(wm_prog_data->base.total_scratch) - 11;
6508 ps.ScratchSpaceBasePointer = rw_bo(bo, 0);
6512 const struct shader_info *fs_info =
6513 crocus_get_shader_info(ice, MESA_SHADER_FRAGMENT);
6514 crocus_emit_cmd(batch, GENX(3DSTATE_PS_EXTRA), psx) {
6515 psx.PixelShaderValid = true;
6516 psx.PixelShaderComputedDepthMode = wm_prog_data->computed_depth_mode;
6517 psx.PixelShaderKillsPixel = wm_prog_data->uses_kill;
6518 psx.AttributeEnable = wm_prog_data->num_varying_inputs != 0;
6519 psx.PixelShaderUsesSourceDepth = wm_prog_data->uses_src_depth;
6520 psx.PixelShaderUsesSourceW = wm_prog_data->uses_src_w;
6521 psx.PixelShaderIsPerSample = wm_prog_data->persample_dispatch;
6523 /* _NEW_MULTISAMPLE | BRW_NEW_CONSERVATIVE_RASTERIZATION */
6524 if (wm_prog_data->uses_sample_mask)
6525 psx.PixelShaderUsesInputCoverageMask = true;
6527 psx.oMaskPresenttoRenderTarget = wm_prog_data->uses_omask;
6529 /* The stricter cross-primitive coherency guarantees that the hardware
6530 * gives us with the "Accesses UAV" bit set for at least one shader stage
6531 * and the "UAV coherency required" bit set on the 3DPRIMITIVE command
6532 * are redundant within the current image, atomic counter and SSBO GL
6533 * APIs, which all have very loose ordering and coherency requirements
6534 * and generally rely on the application to insert explicit barriers when
6535 * a shader invocation is expected to see the memory writes performed by
6536 * the invocations of some previous primitive. Regardless of the value
6537 * of "UAV coherency required", the "Accesses UAV" bits will implicitly
6538 * cause an in most cases useless DC flush when the lowermost stage with
6539 * the bit set finishes execution.
6541 * It would be nice to disable it, but in some cases we can't because on
6542 * Gfx8+ it also has an influence on rasterization via the PS UAV-only
6543 * signal (which could be set independently from the coherency mechanism
6544 * in the 3DSTATE_WM command on Gfx7), and because in some cases it will
6545 * determine whether the hardware skips execution of the fragment shader
6546 * or not via the ThreadDispatchEnable signal. However if we know that
6547 * GFX8_PS_BLEND_HAS_WRITEABLE_RT is going to be set and
6548 * GFX8_PSX_PIXEL_SHADER_NO_RT_WRITE is not set it shouldn't make any
6549 * difference so we may just disable it here.
6551 * Gfx8 hardware tries to compute ThreadDispatchEnable for us but doesn't
6552 * take into account KillPixels when no depth or stencil writes are
6553 * enabled. In order for occlusion queries to work correctly with no
6554 * attachments, we need to force-enable here.
6557 if ((wm_prog_data->has_side_effects || wm_prog_data->uses_kill) &&
6558 !(has_writeable_rt(ice->state.cso_blend, fs_info)))
6559 psx.PixelShaderHasUAV = true;
6566 if (ice->state.streamout_active) {
6567 if (dirty & CROCUS_DIRTY_GEN7_SO_BUFFERS) {
6568 for (int i = 0; i < 4; i++) {
6569 struct crocus_stream_output_target *tgt =
6570 (void *) ice->state.so_target[i];
6573 crocus_emit_cmd(batch, GENX(3DSTATE_SO_BUFFER), sob) {
6574 sob.SOBufferIndex = i;
6575 sob.MOCS = crocus_mocs(NULL, &batch->screen->isl_dev);
6579 struct crocus_resource *res = (void *) tgt->base.buffer;
6580 uint32_t start = tgt->base.buffer_offset;
6582 uint32_t end = ALIGN(start + tgt->base.buffer_size, 4);
6584 crocus_emit_cmd(batch, GENX(3DSTATE_SO_BUFFER), sob) {
6585 sob.SOBufferIndex = i;
6587 sob.SurfaceBaseAddress = rw_bo(res->bo, start);
6588 sob.MOCS = crocus_mocs(res->bo, &batch->screen->isl_dev);
6590 sob.SurfacePitch = tgt->stride;
6591 sob.SurfaceEndAddress = rw_bo(res->bo, end);
6593 sob.SOBufferEnable = true;
6594 sob.StreamOffsetWriteEnable = true;
6595 sob.StreamOutputBufferOffsetAddressEnable = true;
6597 sob.SurfaceSize = MAX2(tgt->base.buffer_size / 4, 1) - 1;
6598 sob.StreamOutputBufferOffsetAddress =
6599 rw_bo(crocus_resource_bo(&tgt->offset_res->base.b), tgt->offset_offset);
6600 if (tgt->zero_offset) {
6601 sob.StreamOffset = 0;
6602 tgt->zero_offset = false;
6604 sob.StreamOffset = 0xFFFFFFFF; /* not offset, see above */
6610 if ((dirty & CROCUS_DIRTY_SO_DECL_LIST) && ice->state.streamout) {
6611 uint32_t *decl_list =
6612 ice->state.streamout + GENX(3DSTATE_STREAMOUT_length);
6613 crocus_batch_emit(batch, decl_list, 4 * ((decl_list[0] & 0xff) + 2));
6616 if (dirty & CROCUS_DIRTY_STREAMOUT) {
6617 const struct crocus_rasterizer_state *cso_rast = ice->state.cso_rast;
6619 uint32_t dynamic_sol[GENX(3DSTATE_STREAMOUT_length)];
6620 crocus_pack_command(GENX(3DSTATE_STREAMOUT), dynamic_sol, sol) {
6621 sol.SOFunctionEnable = true;
6622 sol.SOStatisticsEnable = true;
6624 sol.RenderingDisable = cso_rast->cso.rasterizer_discard &&
6625 !ice->state.prims_generated_query_active;
6626 sol.ReorderMode = cso_rast->cso.flatshade_first ? LEADING : TRAILING;
6629 assert(ice->state.streamout);
6631 crocus_emit_merge(batch, ice->state.streamout, dynamic_sol,
6632 GENX(3DSTATE_STREAMOUT_length));
6635 if (dirty & CROCUS_DIRTY_STREAMOUT) {
6636 crocus_emit_cmd(batch, GENX(3DSTATE_STREAMOUT), sol);
6641 if (ice->state.streamout_active) {
6642 if (dirty & CROCUS_DIRTY_GEN6_SVBI) {
6643 crocus_emit_so_svbi(ice);
6648 if (dirty & CROCUS_DIRTY_CLIP) {
6650 const struct brw_clip_prog_data *clip_prog_data = (struct brw_clip_prog_data *)ice->shaders.clip_prog->prog_data;
6651 struct pipe_rasterizer_state *cso_state = &ice->state.cso_rast->cso;
6653 uint32_t *clip_ptr = stream_state(batch, GENX(CLIP_STATE_length) * 4, 32, &ice->shaders.clip_offset);
6654 dirty |= CROCUS_DIRTY_GEN5_PIPELINED_POINTERS;
6655 _crocus_pack_state(batch, GENX(CLIP_STATE), clip_ptr, clip) {
6656 clip.KernelStartPointer = KSP(ice, ice->shaders.clip_prog);
6657 clip.FloatingPointMode = FLOATING_POINT_MODE_Alternate;
6658 clip.SingleProgramFlow = true;
6659 clip.GRFRegisterCount = DIV_ROUND_UP(clip_prog_data->total_grf, 16) - 1;
6661 clip.VertexURBEntryReadLength = clip_prog_data->urb_read_length;
6662 clip.ConstantURBEntryReadLength = clip_prog_data->curb_read_length;
6664 clip.DispatchGRFStartRegisterForURBData = 1;
6665 clip.VertexURBEntryReadOffset = 0;
6666 clip.ConstantURBEntryReadOffset = ice->curbe.clip_start * 2;
6668 clip.NumberofURBEntries = batch->ice->urb.nr_clip_entries;
6669 clip.URBEntryAllocationSize = batch->ice->urb.vsize - 1;
6671 if (batch->ice->urb.nr_clip_entries >= 10) {
6672 /* Half of the URB entries go to each thread, and it has to be an
6675 assert(batch->ice->urb.nr_clip_entries % 2 == 0);
6677 /* Although up to 16 concurrent Clip threads are allowed on Ironlake,
6678 * only 2 threads can output VUEs at a time.
6680 clip.MaximumNumberofThreads = (GFX_VER == 5 ? 16 : 2) - 1;
6682 assert(batch->ice->urb.nr_clip_entries >= 5);
6683 clip.MaximumNumberofThreads = 1 - 1;
6685 clip.VertexPositionSpace = VPOS_NDCSPACE;
6686 clip.UserClipFlagsMustClipEnable = true;
6687 clip.GuardbandClipTestEnable = true;
6689 clip.ClipperViewportStatePointer = ro_bo(batch->state.bo, ice->state.clip_vp_address);
6690 clip.ScreenSpaceViewportXMin = -1.0;
6691 clip.ScreenSpaceViewportXMax = 1.0;
6692 clip.ScreenSpaceViewportYMin = -1.0;
6693 clip.ScreenSpaceViewportYMax = 1.0;
6694 clip.ViewportXYClipTestEnable = true;
6695 clip.ViewportZClipTestEnable = (cso_state->depth_clip_near || cso_state->depth_clip_far);
6697 #if GFX_VER == 5 || GFX_VERx10 == 45
6698 clip.UserClipDistanceClipTestEnableBitmask = cso_state->clip_plane_enable;
6700 /* Up to 6 actual clip flags, plus the 7th for the negative RHW
6703 clip.UserClipDistanceClipTestEnableBitmask = (cso_state->clip_plane_enable & 0x3f) | 0x40;
6706 clip.APIMode = cso_state->clip_halfz ? APIMODE_D3D : APIMODE_OGL;
6707 clip.GuardbandClipTestEnable = true;
6709 clip.ClipMode = clip_prog_data->clip_mode;
6710 #if GFX_VERx10 == 45
6711 clip.NegativeWClipTestEnable = true;
6715 #else //if GFX_VER >= 6
6716 struct crocus_rasterizer_state *cso_rast = ice->state.cso_rast;
6717 const struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(ice->shaders.prog[MESA_SHADER_FRAGMENT]->prog_data );
6718 struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
6719 bool gs_or_tes = ice->shaders.prog[MESA_SHADER_GEOMETRY] ||
6720 ice->shaders.prog[MESA_SHADER_TESS_EVAL];
6721 bool points_or_lines = cso_rast->fill_mode_point_or_line ||
6722 (gs_or_tes ? ice->shaders.output_topology_is_points_or_lines
6723 : ice->state.prim_is_points_or_lines);
6724 uint32_t dynamic_clip[GENX(3DSTATE_CLIP_length)];
6725 crocus_pack_command(GENX(3DSTATE_CLIP), &dynamic_clip, cl) {
6726 cl.StatisticsEnable = ice->state.statistics_counters_enabled;
6727 if (cso_rast->cso.rasterizer_discard)
6728 cl.ClipMode = CLIPMODE_REJECT_ALL;
6729 else if (ice->state.window_space_position)
6730 cl.ClipMode = CLIPMODE_ACCEPT_ALL;
6732 cl.ClipMode = CLIPMODE_NORMAL;
6734 cl.PerspectiveDivideDisable = ice->state.window_space_position;
6735 cl.ViewportXYClipTestEnable = !points_or_lines;
6737 cl.UserClipDistanceCullTestEnableBitmask =
6738 brw_vue_prog_data(ice->shaders.prog[MESA_SHADER_VERTEX]->prog_data)->cull_distance_mask;
6740 cl.NonPerspectiveBarycentricEnable = wm_prog_data->uses_nonperspective_interp_modes;
6742 cl.ForceZeroRTAIndexEnable = cso_fb->layers <= 1;
6743 cl.MaximumVPIndex = ice->state.num_viewports - 1;
6745 crocus_emit_merge(batch, cso_rast->clip, dynamic_clip,
6746 ARRAY_SIZE(cso_rast->clip));
6750 if (stage_dirty & CROCUS_STAGE_DIRTY_VS) {
6751 struct crocus_compiled_shader *shader = ice->shaders.prog[MESA_SHADER_VERTEX];
6752 const struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(shader->prog_data);
6753 const struct brw_stage_prog_data *prog_data = &vue_prog_data->base;
6755 if (batch->screen->devinfo.platform == INTEL_PLATFORM_IVB)
6756 gen7_emit_vs_workaround_flush(batch);
6761 struct push_bos push_bos = {};
6762 setup_constant_buffers(ice, batch, MESA_SHADER_VERTEX, &push_bos);
6764 emit_push_constant_packets(ice, batch, MESA_SHADER_VERTEX, &push_bos);
6767 crocus_emit_cmd(batch, GENX(3DSTATE_VS), vs)
6769 uint32_t *vs_ptr = stream_state(batch,
6770 GENX(VS_STATE_length) * 4, 32, &ice->shaders.vs_offset);
6771 dirty |= CROCUS_DIRTY_GEN5_PIPELINED_POINTERS;
6772 _crocus_pack_state(batch, GENX(VS_STATE), vs_ptr, vs)
6775 INIT_THREAD_DISPATCH_FIELDS(vs, Vertex, MESA_SHADER_VERTEX);
6777 vs.MaximumNumberofThreads = batch->screen->devinfo.max_vs_threads - 1;
6780 vs.GRFRegisterCount = DIV_ROUND_UP(vue_prog_data->total_grf, 16) - 1;
6781 vs.ConstantURBEntryReadLength = vue_prog_data->base.curb_read_length;
6782 vs.ConstantURBEntryReadOffset = ice->curbe.vs_start * 2;
6784 vs.NumberofURBEntries = batch->ice->urb.nr_vs_entries >> (GFX_VER == 5 ? 2 : 0);
6785 vs.URBEntryAllocationSize = batch->ice->urb.vsize - 1;
6787 vs.MaximumNumberofThreads =
6788 CLAMP(batch->ice->urb.nr_vs_entries / 2, 1, batch->screen->devinfo.max_vs_threads) - 1;
6789 vs.StatisticsEnable = false;
6790 vs.SamplerStatePointer = ro_bo(batch->state.bo, ice->state.shaders[MESA_SHADER_VERTEX].sampler_offset);
6793 /* Force single program flow on Ironlake. We cannot reliably get
6794 * all applications working without it. See:
6795 * https://bugs.freedesktop.org/show_bug.cgi?id=29172
6797 * The most notable and reliably failing application is the Humus
6800 vs.SingleProgramFlow = true;
6801 vs.SamplerCount = 0; /* hardware requirement */
6805 vs.SIMD8DispatchEnable =
6806 vue_prog_data->dispatch_mode == DISPATCH_MODE_SIMD8;
6808 vs.UserClipDistanceCullTestEnableBitmask =
6809 vue_prog_data->cull_distance_mask;
6814 crocus_emit_pipe_control_flush(batch,
6816 PIPE_CONTROL_DEPTH_STALL |
6817 PIPE_CONTROL_INSTRUCTION_INVALIDATE |
6818 PIPE_CONTROL_STATE_CACHE_INVALIDATE);
6822 if (stage_dirty & CROCUS_STAGE_DIRTY_GS) {
6823 struct crocus_compiled_shader *shader = ice->shaders.prog[MESA_SHADER_GEOMETRY];
6824 bool active = GFX_VER >= 6 && shader;
6826 struct push_bos push_bos = {};
6828 setup_constant_buffers(ice, batch, MESA_SHADER_GEOMETRY, &push_bos);
6830 emit_push_constant_packets(ice, batch, MESA_SHADER_GEOMETRY, &push_bos);
6832 #if GFX_VERx10 == 70
6834 * From Graphics BSpec: 3D-Media-GPGPU Engine > 3D Pipeline Stages >
6835 * Geometry > Geometry Shader > State:
6837 * "Note: Because of corruption in IVB:GT2, software needs to flush the
6838 * whole fixed function pipeline when the GS enable changes value in
6841 * The hardware architects have clarified that in this context "flush the
6842 * whole fixed function pipeline" means to emit a PIPE_CONTROL with the "CS
6845 if (batch->screen->devinfo.gt == 2 && ice->state.gs_enabled != active)
6846 gen7_emit_cs_stall_flush(batch);
6849 crocus_emit_cmd(batch, GENX(3DSTATE_GS), gs)
6851 uint32_t *gs_ptr = stream_state(batch,
6852 GENX(GS_STATE_length) * 4, 32, &ice->shaders.gs_offset);
6853 dirty |= CROCUS_DIRTY_GEN5_PIPELINED_POINTERS;
6854 _crocus_pack_state(batch, GENX(GS_STATE), gs_ptr, gs)
6859 const struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(shader->prog_data);
6860 const struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(shader->prog_data);
6861 const struct brw_stage_prog_data *prog_data = &gs_prog_data->base.base;
6863 INIT_THREAD_DISPATCH_FIELDS(gs, Vertex, MESA_SHADER_GEOMETRY);
6865 gs.OutputVertexSize = gs_prog_data->output_vertex_size_hwords * 2 - 1;
6866 gs.OutputTopology = gs_prog_data->output_topology;
6867 gs.ControlDataHeaderSize =
6868 gs_prog_data->control_data_header_size_hwords;
6870 gs.InstanceControl = gs_prog_data->invocations - 1;
6871 gs.DispatchMode = vue_prog_data->dispatch_mode;
6873 gs.IncludePrimitiveID = gs_prog_data->include_primitive_id;
6875 gs.ControlDataFormat = gs_prog_data->control_data_format;
6878 /* Note: the meaning of the GEN7_GS_REORDER_TRAILING bit changes between
6879 * Ivy Bridge and Haswell.
6881 * On Ivy Bridge, setting this bit causes the vertices of a triangle
6882 * strip to be delivered to the geometry shader in an order that does
6883 * not strictly follow the OpenGL spec, but preserves triangle
6884 * orientation. For example, if the vertices are (1, 2, 3, 4, 5), then
6885 * the geometry shader sees triangles:
6887 * (1, 2, 3), (2, 4, 3), (3, 4, 5)
6889 * (Clearing the bit is even worse, because it fails to preserve
6892 * Triangle strips with adjacency always ordered in a way that preserves
6893 * triangle orientation but does not strictly follow the OpenGL spec,
6894 * regardless of the setting of this bit.
6896 * On Haswell, both triangle strips and triangle strips with adjacency
6897 * are always ordered in a way that preserves triangle orientation.
6898 * Setting this bit causes the ordering to strictly follow the OpenGL
6901 * So in either case we want to set the bit. Unfortunately on Ivy
6902 * Bridge this will get the order close to correct but not perfect.
6904 gs.ReorderMode = TRAILING;
6905 gs.MaximumNumberofThreads =
6906 GFX_VER == 8 ? (batch->screen->devinfo.max_gs_threads / 2 - 1) :
6907 (batch->screen->devinfo.max_gs_threads - 1);
6909 gs.SOStatisticsEnable = true;
6910 if (gs_prog_data->num_transform_feedback_bindings)
6911 gs.SVBIPayloadEnable = ice->state.streamout_active;
6913 /* GEN6_GS_SPF_MODE and GEN6_GS_VECTOR_MASK_ENABLE are enabled as it
6914 * was previously done for gen6.
6916 * TODO: test with both disabled to see if the HW is behaving
6917 * as expected, like in gen7.
6919 gs.SingleProgramFlow = true;
6920 gs.VectorMaskEnable = true;
6923 gs.ExpectedVertexCount = gs_prog_data->vertices_in;
6925 if (gs_prog_data->static_vertex_count != -1) {
6926 gs.StaticOutput = true;
6927 gs.StaticOutputVertexCount = gs_prog_data->static_vertex_count;
6929 gs.IncludeVertexHandles = vue_prog_data->include_vue_handles;
6931 gs.UserClipDistanceCullTestEnableBitmask =
6932 vue_prog_data->cull_distance_mask;
6934 const int urb_entry_write_offset = 1;
6935 const uint32_t urb_entry_output_length =
6936 DIV_ROUND_UP(vue_prog_data->vue_map.num_slots, 2) -
6937 urb_entry_write_offset;
6939 gs.VertexURBEntryOutputReadOffset = urb_entry_write_offset;
6940 gs.VertexURBEntryOutputLength = MAX2(urb_entry_output_length, 1);
6945 if (!active && ice->shaders.ff_gs_prog) {
6946 const struct brw_ff_gs_prog_data *gs_prog_data = (struct brw_ff_gs_prog_data *)ice->shaders.ff_gs_prog->prog_data;
6947 /* In gen6, transform feedback for the VS stage is done with an
6948 * ad-hoc GS program. This function provides the needed 3DSTATE_GS
6951 gs.KernelStartPointer = KSP(ice, ice->shaders.ff_gs_prog);
6952 gs.SingleProgramFlow = true;
6953 gs.DispatchGRFStartRegisterForURBData = GFX_VER == 6 ? 2 : 1;
6954 gs.VertexURBEntryReadLength = gs_prog_data->urb_read_length;
6957 gs.GRFRegisterCount =
6958 DIV_ROUND_UP(gs_prog_data->total_grf, 16) - 1;
6959 /* BRW_NEW_URB_FENCE */
6960 gs.NumberofURBEntries = batch->ice->urb.nr_gs_entries;
6961 gs.URBEntryAllocationSize = batch->ice->urb.vsize - 1;
6962 gs.MaximumNumberofThreads = batch->ice->urb.nr_gs_entries >= 8 ? 1 : 0;
6963 gs.FloatingPointMode = FLOATING_POINT_MODE_Alternate;
6966 gs.VectorMaskEnable = true;
6967 gs.SVBIPayloadEnable = true;
6968 gs.SVBIPostIncrementEnable = true;
6969 gs.SVBIPostIncrementValue = gs_prog_data->svbi_postincrement_value;
6970 gs.SOStatisticsEnable = true;
6971 gs.MaximumNumberofThreads = batch->screen->devinfo.max_gs_threads - 1;
6975 if (!active && !ice->shaders.ff_gs_prog) {
6977 gs.DispatchGRFStartRegisterForURBData = 1;
6979 gs.IncludeVertexHandles = true;
6984 gs.StatisticsEnable = true;
6986 #if GFX_VER == 5 || GFX_VER == 6
6987 gs.RenderingEnabled = true;
6990 gs.MaximumVPIndex = ice->state.num_viewports - 1;
6993 ice->state.gs_enabled = active;
6997 if (stage_dirty & CROCUS_STAGE_DIRTY_TCS) {
6998 struct crocus_compiled_shader *shader = ice->shaders.prog[MESA_SHADER_TESS_CTRL];
7001 const struct brw_tcs_prog_data *tcs_prog_data = brw_tcs_prog_data(shader->prog_data);
7002 const struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(shader->prog_data);
7003 const struct brw_stage_prog_data *prog_data = &tcs_prog_data->base.base;
7005 crocus_emit_cmd(batch, GENX(3DSTATE_HS), hs) {
7006 INIT_THREAD_DISPATCH_FIELDS(hs, Vertex, MESA_SHADER_TESS_CTRL);
7007 hs.InstanceCount = tcs_prog_data->instances - 1;
7008 hs.IncludeVertexHandles = true;
7009 hs.MaximumNumberofThreads = batch->screen->devinfo.max_tcs_threads - 1;
7012 crocus_emit_cmd(batch, GENX(3DSTATE_HS), hs);
7017 if (stage_dirty & CROCUS_STAGE_DIRTY_TES) {
7018 struct crocus_compiled_shader *shader = ice->shaders.prog[MESA_SHADER_TESS_EVAL];
7020 const struct brw_tes_prog_data *tes_prog_data = brw_tes_prog_data(shader->prog_data);
7021 const struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(shader->prog_data);
7022 const struct brw_stage_prog_data *prog_data = &tes_prog_data->base.base;
7024 crocus_emit_cmd(batch, GENX(3DSTATE_TE), te) {
7025 te.Partitioning = tes_prog_data->partitioning;
7026 te.OutputTopology = tes_prog_data->output_topology;
7027 te.TEDomain = tes_prog_data->domain;
7029 te.MaximumTessellationFactorOdd = 63.0;
7030 te.MaximumTessellationFactorNotOdd = 64.0;
7032 crocus_emit_cmd(batch, GENX(3DSTATE_DS), ds) {
7033 INIT_THREAD_DISPATCH_FIELDS(ds, Patch, MESA_SHADER_TESS_EVAL);
7035 ds.MaximumNumberofThreads = batch->screen->devinfo.max_tes_threads - 1;
7036 ds.ComputeWCoordinateEnable =
7037 tes_prog_data->domain == BRW_TESS_DOMAIN_TRI;
7040 if (vue_prog_data->dispatch_mode == DISPATCH_MODE_SIMD8)
7041 ds.DispatchMode = DISPATCH_MODE_SIMD8_SINGLE_PATCH;
7042 ds.UserClipDistanceCullTestEnableBitmask =
7043 vue_prog_data->cull_distance_mask;
7047 crocus_emit_cmd(batch, GENX(3DSTATE_TE), te);
7048 crocus_emit_cmd(batch, GENX(3DSTATE_DS), ds);
7052 if (dirty & CROCUS_DIRTY_RASTER) {
7055 const struct brw_sf_prog_data *sf_prog_data = (struct brw_sf_prog_data *)ice->shaders.sf_prog->prog_data;
7056 struct pipe_rasterizer_state *cso_state = &ice->state.cso_rast->cso;
7057 uint32_t *sf_ptr = stream_state(batch,
7058 GENX(SF_STATE_length) * 4, 32, &ice->shaders.sf_offset);
7059 dirty |= CROCUS_DIRTY_GEN5_PIPELINED_POINTERS;
7060 _crocus_pack_state(batch, GENX(SF_STATE), sf_ptr, sf) {
7061 sf.KernelStartPointer = KSP(ice, ice->shaders.sf_prog);
7062 sf.FloatingPointMode = FLOATING_POINT_MODE_Alternate;
7063 sf.GRFRegisterCount = DIV_ROUND_UP(sf_prog_data->total_grf, 16) - 1;
7064 sf.DispatchGRFStartRegisterForURBData = 3;
7065 sf.VertexURBEntryReadOffset = BRW_SF_URB_ENTRY_READ_OFFSET;
7066 sf.VertexURBEntryReadLength = sf_prog_data->urb_read_length;
7067 sf.URBEntryAllocationSize = batch->ice->urb.sfsize - 1;
7068 sf.NumberofURBEntries = batch->ice->urb.nr_sf_entries;
7069 sf.PointRasterizationRule = RASTRULE_UPPER_RIGHT;
7071 sf.SetupViewportStateOffset = ro_bo(batch->state.bo, ice->state.sf_vp_address);
7073 sf.MaximumNumberofThreads =
7074 MIN2(GFX_VER == 5 ? 48 : 24, batch->ice->urb.nr_sf_entries) - 1;
7076 sf.SpritePointEnable = cso_state->point_quad_rasterization;
7077 sf.DestinationOriginHorizontalBias = 0.5;
7078 sf.DestinationOriginVerticalBias = 0.5;
7080 sf.LineEndCapAntialiasingRegionWidth =
7081 cso_state->line_smooth ? _10pixels : _05pixels;
7082 sf.LastPixelEnable = cso_state->line_last_pixel;
7083 sf.AntialiasingEnable = cso_state->line_smooth;
7085 sf.LineWidth = get_line_width(cso_state);
7086 sf.PointWidth = cso_state->point_size;
7087 sf.PointWidthSource = cso_state->point_size_per_vertex ? Vertex : State;
7088 #if GFX_VERx10 >= 45
7089 sf.AALineDistanceMode = AALINEDISTANCE_TRUE;
7091 sf.ViewportTransformEnable = true;
7092 sf.FrontWinding = cso_state->front_ccw ? 1 : 0;
7093 sf.ScissorRectangleEnable = true;
7094 sf.CullMode = translate_cull_mode(cso_state->cull_face);
7096 if (cso_state->flatshade_first) {
7097 sf.TriangleFanProvokingVertexSelect = 1;
7099 sf.TriangleStripListProvokingVertexSelect = 2;
7100 sf.TriangleFanProvokingVertexSelect = 2;
7101 sf.LineStripListProvokingVertexSelect = 1;
7105 struct crocus_rasterizer_state *cso = ice->state.cso_rast;
7106 uint32_t dynamic_sf[GENX(3DSTATE_SF_length)];
7107 crocus_pack_command(GENX(3DSTATE_SF), &dynamic_sf, sf) {
7108 sf.ViewportTransformEnable = !ice->state.window_space_position;
7111 const struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(ice->shaders.prog[MESA_SHADER_FRAGMENT]->prog_data);
7112 uint32_t urb_entry_read_length;
7113 uint32_t urb_entry_read_offset;
7114 uint32_t point_sprite_enables;
7115 calculate_attr_overrides(ice, sf.Attribute, &point_sprite_enables,
7116 &urb_entry_read_length,
7117 &urb_entry_read_offset);
7118 sf.VertexURBEntryReadLength = urb_entry_read_length;
7119 sf.VertexURBEntryReadOffset = urb_entry_read_offset;
7120 sf.PointSpriteTextureCoordinateEnable = point_sprite_enables;
7121 sf.ConstantInterpolationEnable = wm_prog_data->flat_inputs;
7122 sf.NumberofSFOutputAttributes = wm_prog_data->num_varying_inputs;
7125 #if GFX_VER >= 6 && GFX_VER < 8
7126 if (ice->state.framebuffer.samples > 1 && ice->state.cso_rast->cso.multisample)
7127 sf.MultisampleRasterizationMode = MSRASTMODE_ON_PATTERN;
7130 if (ice->state.framebuffer.zsbuf) {
7131 struct crocus_resource *zres, *sres;
7132 crocus_get_depth_stencil_resources(&batch->screen->devinfo,
7133 ice->state.framebuffer.zsbuf->texture,
7135 /* ANV thinks that the stencil-ness doesn't matter, this is just
7136 * about handling polygon offset scaling.
7138 sf.DepthBufferSurfaceFormat = zres ? isl_format_get_depth_format(zres->surf.format, false) : D16_UNORM;
7142 crocus_emit_merge(batch, cso->sf, dynamic_sf,
7143 ARRAY_SIZE(dynamic_sf));
7145 crocus_batch_emit(batch, cso->raster, sizeof(cso->raster));
7150 if (dirty & CROCUS_DIRTY_WM) {
7151 struct crocus_rasterizer_state *cso = ice->state.cso_rast;
7152 const struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(ice->shaders.prog[MESA_SHADER_FRAGMENT]->prog_data);
7153 UNUSED bool writes_depth = wm_prog_data->computed_depth_mode != BRW_PSCDEPTH_OFF;
7154 UNUSED const struct shader_info *fs_info =
7155 crocus_get_shader_info(ice, MESA_SHADER_FRAGMENT);
7158 struct push_bos push_bos = {};
7159 setup_constant_buffers(ice, batch, MESA_SHADER_FRAGMENT, &push_bos);
7161 emit_push_constant_packets(ice, batch, MESA_SHADER_FRAGMENT, &push_bos);
7164 crocus_emit_cmd(batch, GENX(3DSTATE_WM), wm)
7166 uint32_t *wm_ptr = stream_state(batch,
7167 GENX(WM_STATE_length) * 4, 32, &ice->shaders.wm_offset);
7169 dirty |= CROCUS_DIRTY_GEN5_PIPELINED_POINTERS;
7171 _crocus_pack_state(batch, GENX(WM_STATE), wm_ptr, wm)
7175 wm._8PixelDispatchEnable = wm_prog_data->dispatch_8;
7176 wm._16PixelDispatchEnable = wm_prog_data->dispatch_16;
7177 wm._32PixelDispatchEnable = wm_prog_data->dispatch_32;
7180 /* On gen4, we only have one shader kernel */
7181 if (brw_wm_state_has_ksp(wm, 0)) {
7182 wm.KernelStartPointer0 = KSP(ice, ice->shaders.prog[MESA_SHADER_FRAGMENT]);
7183 wm.GRFRegisterCount0 = brw_wm_prog_data_reg_blocks(wm_prog_data, wm, 0);
7184 wm.DispatchGRFStartRegisterForConstantSetupData0 =
7185 wm_prog_data->base.dispatch_grf_start_reg;
7188 wm.KernelStartPointer0 = KSP(ice, ice->shaders.prog[MESA_SHADER_FRAGMENT]) +
7189 brw_wm_prog_data_prog_offset(wm_prog_data, wm, 0);
7190 wm.KernelStartPointer1 = KSP(ice, ice->shaders.prog[MESA_SHADER_FRAGMENT]) +
7191 brw_wm_prog_data_prog_offset(wm_prog_data, wm, 1);
7192 wm.KernelStartPointer2 = KSP(ice, ice->shaders.prog[MESA_SHADER_FRAGMENT]) +
7193 brw_wm_prog_data_prog_offset(wm_prog_data, wm, 2);
7195 wm.GRFRegisterCount0 = brw_wm_prog_data_reg_blocks(wm_prog_data, wm, 0);
7196 wm.GRFRegisterCount1 = brw_wm_prog_data_reg_blocks(wm_prog_data, wm, 1);
7197 wm.GRFRegisterCount2 = brw_wm_prog_data_reg_blocks(wm_prog_data, wm, 2);
7199 wm.DispatchGRFStartRegisterForConstantSetupData0 =
7200 wm_prog_data->base.dispatch_grf_start_reg;
7202 wm.KernelStartPointer0 = KSP(ice, ice->shaders.prog[MESA_SHADER_FRAGMENT]) +
7203 brw_wm_prog_data_prog_offset(wm_prog_data, wm, 0);
7204 wm.KernelStartPointer1 = KSP(ice, ice->shaders.prog[MESA_SHADER_FRAGMENT]) +
7205 brw_wm_prog_data_prog_offset(wm_prog_data, wm, 1);
7206 wm.KernelStartPointer2 = KSP(ice, ice->shaders.prog[MESA_SHADER_FRAGMENT]) +
7207 brw_wm_prog_data_prog_offset(wm_prog_data, wm, 2);
7209 wm.DispatchGRFStartRegisterForConstantSetupData0 =
7210 brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, wm, 0);
7211 wm.DispatchGRFStartRegisterForConstantSetupData1 =
7212 brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, wm, 1);
7213 wm.DispatchGRFStartRegisterForConstantSetupData2 =
7214 brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, wm, 2);
7217 wm.ConstantURBEntryReadLength = wm_prog_data->base.curb_read_length;
7218 wm.ConstantURBEntryReadOffset = ice->curbe.wm_start * 2;
7219 wm.SetupURBEntryReadLength = wm_prog_data->num_varying_inputs * 2;
7220 wm.SetupURBEntryReadOffset = 0;
7221 wm.EarlyDepthTestEnable = true;
7222 wm.LineAntialiasingRegionWidth = _05pixels;
7223 wm.LineEndCapAntialiasingRegionWidth = _10pixels;
7224 wm.DepthCoefficientURBReadOffset = 1;
7226 if (cso->cso.offset_tri) {
7227 wm.GlobalDepthOffsetEnable = true;
7229 /* Something weird going on with legacy_global_depth_bias,
7230 * offset_constant, scaling and MRD. This value passes glean
7231 * but gives some odd results elsewhere (e.g. the
7232 * quad-offset-units test).
7234 wm.GlobalDepthOffsetConstant = cso->cso.offset_units * 2;
7235 wm.GlobalDepthOffsetScale = cso->cso.offset_scale;
7237 wm.SamplerStatePointer = ro_bo(batch->state.bo,
7238 ice->state.shaders[MESA_SHADER_FRAGMENT].sampler_offset);
7241 wm.StatisticsEnable = (GFX_VER >= 6 || ice->state.stats_wm) ?
7242 ice->state.statistics_counters_enabled : 0;
7245 wm.LineAntialiasingRegionWidth = _10pixels;
7246 wm.LineEndCapAntialiasingRegionWidth = _05pixels;
7248 wm.PointRasterizationRule = RASTRULE_UPPER_RIGHT;
7249 wm.BarycentricInterpolationMode = wm_prog_data->barycentric_interp_modes;
7252 wm.DualSourceBlendEnable = wm_prog_data->dual_src_blend &&
7253 ice->state.cso_blend->dual_color_blending;
7254 wm.oMaskPresenttoRenderTarget = wm_prog_data->uses_omask;
7255 wm.NumberofSFOutputAttributes = wm_prog_data->num_varying_inputs;
7257 /* From the SNB PRM, volume 2 part 1, page 281:
7258 * "If the PS kernel does not need the Position XY Offsets
7259 * to compute a Position XY value, then this field should be
7260 * programmed to POSOFFSET_NONE."
7262 * "SW Recommendation: If the PS kernel needs the Position Offsets
7263 * to compute a Position XY value, this field should match Position
7264 * ZW Interpolation Mode to ensure a consistent position.xyzw
7266 * We only require XY sample offsets. So, this recommendation doesn't
7267 * look useful at the moment. We might need this in future.
7269 if (wm_prog_data->uses_pos_offset)
7270 wm.PositionXYOffsetSelect = POSOFFSET_SAMPLE;
7272 wm.PositionXYOffsetSelect = POSOFFSET_NONE;
7274 wm.LineStippleEnable = cso->cso.line_stipple_enable;
7275 wm.PolygonStippleEnable = cso->cso.poly_stipple_enable;
7278 if (wm_prog_data->base.use_alt_mode)
7279 wm.FloatingPointMode = FLOATING_POINT_MODE_Alternate;
7280 wm.BindingTableEntryCount = ice->shaders.prog[MESA_SHADER_FRAGMENT]->bt.size_bytes / 4;
7281 wm.MaximumNumberofThreads = batch->screen->devinfo.max_wm_threads - 1;
7286 wm.PixelShaderUsesSourceW = wm_prog_data->uses_src_w;
7288 struct pipe_framebuffer_state *fb = &ice->state.framebuffer;
7289 if (fb->samples > 1) {
7290 if (cso->cso.multisample)
7291 wm.MultisampleRasterizationMode = MSRASTMODE_ON_PATTERN;
7293 wm.MultisampleRasterizationMode = MSRASTMODE_OFF_PIXEL;
7295 if (wm_prog_data->persample_dispatch)
7296 wm.MultisampleDispatchMode = MSDISPMODE_PERSAMPLE;
7298 wm.MultisampleDispatchMode = MSDISPMODE_PERPIXEL;
7300 wm.MultisampleRasterizationMode = MSRASTMODE_OFF_PIXEL;
7301 wm.MultisampleDispatchMode = MSDISPMODE_PERSAMPLE;
7305 wm.PixelShaderUsesSourceDepth = wm_prog_data->uses_src_depth;
7307 if (wm_prog_data->uses_kill ||
7308 ice->state.cso_zsa->cso.alpha_enabled ||
7309 ice->state.cso_blend->cso.alpha_to_coverage ||
7310 (GFX_VER >= 6 && wm_prog_data->uses_omask))
7311 wm.PixelShaderKillsPixel = true;
7313 if (has_writeable_rt(ice->state.cso_blend, fs_info) ||
7314 writes_depth || wm.PixelShaderKillsPixel ||
7315 (GFX_VER >= 6 && wm_prog_data->has_side_effects))
7316 wm.ThreadDispatchEnable = true;
7319 wm.PixelShaderComputedDepthMode = wm_prog_data->computed_depth_mode;
7320 wm.PixelShaderUsesInputCoverageMask = wm_prog_data->uses_sample_mask;
7322 if (wm_prog_data->base.total_scratch) {
7323 struct crocus_bo *bo = crocus_get_scratch_space(ice, wm_prog_data->base.total_scratch,
7324 MESA_SHADER_FRAGMENT);
7325 wm.PerThreadScratchSpace = ffs(wm_prog_data->base.total_scratch) - 11;
7326 wm.ScratchSpaceBasePointer = rw_bo(bo, 0);
7329 wm.PixelShaderComputedDepth = writes_depth;
7332 /* The "UAV access enable" bits are unnecessary on HSW because they only
7333 * seem to have an effect on the HW-assisted coherency mechanism which we
7334 * don't need, and the rasterization-related UAV_ONLY flag and the
7335 * DISPATCH_ENABLE bit can be set independently from it.
7336 * C.f. gen8_upload_ps_extra().
7338 * BRW_NEW_FRAGMENT_PROGRAM | BRW_NEW_FS_PROG_DATA | _NEW_BUFFERS |
7341 #if GFX_VERx10 == 75
7342 if (!(has_writeable_rt(ice->state.cso_blend, fs_info) || writes_depth) &&
7343 wm_prog_data->has_side_effects)
7348 /* BRW_NEW_FS_PROG_DATA */
7349 if (wm_prog_data->early_fragment_tests)
7350 wm.EarlyDepthStencilControl = EDSC_PREPS;
7351 else if (wm_prog_data->has_side_effects)
7352 wm.EarlyDepthStencilControl = EDSC_PSEXEC;
7355 /* We could skip this bit if color writes are enabled. */
7356 if (wm_prog_data->has_side_effects || wm_prog_data->uses_kill)
7357 wm.ForceThreadDispatchEnable = ForceON;
7362 if (ice->state.global_depth_offset_clamp != cso->cso.offset_clamp) {
7363 crocus_emit_cmd(batch, GENX(3DSTATE_GLOBAL_DEPTH_OFFSET_CLAMP), clamp) {
7364 clamp.GlobalDepthOffsetClamp = cso->cso.offset_clamp;
7366 ice->state.global_depth_offset_clamp = cso->cso.offset_clamp;
7372 if (dirty & CROCUS_DIRTY_GEN7_SBE) {
7373 crocus_emit_sbe(batch, ice);
7378 if (dirty & CROCUS_DIRTY_GEN8_PS_BLEND) {
7379 struct crocus_compiled_shader *shader = ice->shaders.prog[MESA_SHADER_FRAGMENT];
7380 struct crocus_blend_state *cso_blend = ice->state.cso_blend;
7381 struct crocus_depth_stencil_alpha_state *cso_zsa = ice->state.cso_zsa;
7382 struct brw_wm_prog_data *wm_prog_data = (void *) shader->prog_data;
7383 const struct shader_info *fs_info =
7384 crocus_get_shader_info(ice, MESA_SHADER_FRAGMENT);
7385 uint32_t dynamic_pb[GENX(3DSTATE_PS_BLEND_length)];
7386 crocus_pack_command(GENX(3DSTATE_PS_BLEND), &dynamic_pb, pb) {
7387 pb.HasWriteableRT = has_writeable_rt(cso_blend, fs_info);
7388 pb.AlphaTestEnable = cso_zsa->cso.alpha_enabled;
7389 pb.ColorBufferBlendEnable = (cso_blend->blend_enables & 1) &&
7390 (!cso_blend->dual_color_blending || wm_prog_data->dual_src_blend);
7392 crocus_emit_merge(batch, cso_blend->ps_blend, dynamic_pb,
7393 ARRAY_SIZE(cso_blend->ps_blend));
7398 if (dirty & CROCUS_DIRTY_GEN6_WM_DEPTH_STENCIL) {
7401 crocus_emit_cmd(batch, GENX(3DSTATE_WM_DEPTH_STENCIL), wmds) {
7402 set_depth_stencil_bits(ice, &wmds);
7406 void *ds_map = stream_state(batch,
7407 sizeof(uint32_t) * GENX(DEPTH_STENCIL_STATE_length),
7409 _crocus_pack_state(batch, GENX(DEPTH_STENCIL_STATE), ds_map, ds) {
7410 set_depth_stencil_bits(ice, &ds);
7414 crocus_emit_cmd(batch, GENX(3DSTATE_CC_STATE_POINTERS), ptr) {
7415 ptr.PointertoDEPTH_STENCIL_STATE = ds_offset;
7416 ptr.DEPTH_STENCIL_STATEChange = true;
7419 crocus_emit_cmd(batch, GENX(3DSTATE_DEPTH_STENCIL_STATE_POINTERS), ptr) {
7420 ptr.PointertoDEPTH_STENCIL_STATE = ds_offset;
7426 if (dirty & CROCUS_DIRTY_GEN6_SCISSOR_RECT) {
7427 /* Align to 64-byte boundary as per anv. */
7428 uint32_t scissor_offset;
7429 struct pipe_scissor_state *scissor_map = (void *)
7430 stream_state(batch, sizeof(struct pipe_scissor_state) * ice->state.num_viewports,
7431 64, &scissor_offset);
7432 for (int i = 0; i < ice->state.num_viewports; i++) {
7433 struct pipe_scissor_state scissor;
7434 crocus_fill_scissor_rect(ice, i, &scissor);
7435 scissor_map[i] = scissor;
7438 crocus_emit_cmd(batch, GENX(3DSTATE_SCISSOR_STATE_POINTERS), ptr) {
7439 ptr.ScissorRectPointer = scissor_offset;
7444 if (dirty & CROCUS_DIRTY_DEPTH_BUFFER) {
7445 struct isl_device *isl_dev = &batch->screen->isl_dev;
7447 crocus_emit_depth_stall_flushes(batch);
7450 struct crocus_resource *zres, *sres;
7451 struct pipe_framebuffer_state *cso = &ice->state.framebuffer;
7452 batch_ptr = crocus_get_command_space(batch, isl_dev->ds.size);
7454 struct isl_view view = {
7457 .base_array_layer = 0,
7459 .swizzle = ISL_SWIZZLE_IDENTITY,
7461 struct isl_depth_stencil_hiz_emit_info info = {
7463 .mocs = crocus_mocs(NULL, isl_dev),
7467 crocus_get_depth_stencil_resources(&batch->screen->devinfo, cso->zsbuf->texture, &zres, &sres);
7468 struct crocus_surface *zsbuf = (struct crocus_surface *)cso->zsbuf;
7469 if (zsbuf->align_res) {
7470 zres = (struct crocus_resource *)zsbuf->align_res;
7472 view.base_level = cso->zsbuf->u.tex.level;
7473 view.base_array_layer = cso->zsbuf->u.tex.first_layer;
7474 view.array_len = cso->zsbuf->u.tex.last_layer - cso->zsbuf->u.tex.first_layer + 1;
7477 view.usage |= ISL_SURF_USAGE_DEPTH_BIT;
7479 info.depth_surf = &zres->surf;
7480 info.depth_address = crocus_command_reloc(batch,
7481 (batch_ptr - batch->command.map) + isl_dev->ds.depth_offset,
7482 zres->bo, 0, RELOC_32BIT);
7484 info.mocs = crocus_mocs(zres->bo, isl_dev);
7485 view.format = zres->surf.format;
7487 if (crocus_resource_level_has_hiz(zres, view.base_level)) {
7488 info.hiz_usage = zres->aux.usage;
7489 info.hiz_surf = &zres->aux.surf;
7490 uint64_t hiz_offset = 0;
7493 /* HiZ surfaces on Sandy Bridge technically don't support
7494 * mip-mapping. However, we can fake it by offsetting to the
7495 * first slice of LOD0 in the HiZ surface.
7497 isl_surf_get_image_offset_B_tile_sa(&zres->aux.surf,
7498 view.base_level, 0, 0,
7499 &hiz_offset, NULL, NULL);
7501 info.hiz_address = crocus_command_reloc(batch,
7502 (batch_ptr - batch->command.map) + isl_dev->ds.hiz_offset,
7503 zres->aux.bo, zres->aux.offset + hiz_offset,
7505 info.depth_clear_value = crocus_resource_get_clear_color(zres).f32[0];
7511 view.usage |= ISL_SURF_USAGE_STENCIL_BIT;
7512 info.stencil_aux_usage = sres->aux.usage;
7513 info.stencil_surf = &sres->surf;
7515 uint64_t stencil_offset = 0;
7517 /* Stencil surfaces on Sandy Bridge technically don't support
7518 * mip-mapping. However, we can fake it by offsetting to the
7519 * first slice of LOD0 in the stencil surface.
7521 isl_surf_get_image_offset_B_tile_sa(&sres->surf,
7522 view.base_level, 0, 0,
7523 &stencil_offset, NULL, NULL);
7526 info.stencil_address = crocus_command_reloc(batch,
7527 (batch_ptr - batch->command.map) + isl_dev->ds.stencil_offset,
7528 sres->bo, stencil_offset, RELOC_32BIT);
7530 view.format = sres->surf.format;
7531 info.mocs = crocus_mocs(sres->bo, isl_dev);
7536 isl_emit_depth_stencil_hiz_s(isl_dev, batch_ptr, &info);
7539 /* TODO: Disable emitting this until something uses a stipple. */
7540 if (dirty & CROCUS_DIRTY_POLYGON_STIPPLE) {
7541 crocus_emit_cmd(batch, GENX(3DSTATE_POLY_STIPPLE_PATTERN), poly) {
7542 for (int i = 0; i < 32; i++) {
7543 poly.PatternRow[i] = ice->state.poly_stipple.stipple[i];
7548 if (dirty & CROCUS_DIRTY_LINE_STIPPLE) {
7549 struct crocus_rasterizer_state *cso = ice->state.cso_rast;
7550 crocus_batch_emit(batch, cso->line_stipple, sizeof(cso->line_stipple));
7554 if (dirty & CROCUS_DIRTY_GEN8_VF_TOPOLOGY) {
7555 crocus_emit_cmd(batch, GENX(3DSTATE_VF_TOPOLOGY), topo) {
7556 topo.PrimitiveTopologyType =
7557 translate_prim_type(draw->mode, ice->state.patch_vertices);
7563 if (dirty & CROCUS_DIRTY_GEN5_PIPELINED_POINTERS) {
7564 upload_pipelined_state_pointers(batch, ice->shaders.ff_gs_prog ? true : false, ice->shaders.gs_offset,
7565 ice->shaders.vs_offset, ice->shaders.sf_offset,
7566 ice->shaders.clip_offset, ice->shaders.wm_offset, ice->shaders.cc_offset);
7567 crocus_upload_urb_fence(batch);
7569 crocus_emit_cmd(batch, GENX(CS_URB_STATE), cs) {
7570 cs.NumberofURBEntries = ice->urb.nr_cs_entries;
7571 cs.URBEntryAllocationSize = ice->urb.csize - 1;
7573 dirty |= CROCUS_DIRTY_GEN4_CURBE;
7576 if (dirty & CROCUS_DIRTY_DRAWING_RECTANGLE) {
7577 struct pipe_framebuffer_state *fb = &ice->state.framebuffer;
7578 if (fb->width && fb->height) {
7579 crocus_emit_cmd(batch, GENX(3DSTATE_DRAWING_RECTANGLE), rect) {
7580 rect.ClippedDrawingRectangleXMax = fb->width - 1;
7581 rect.ClippedDrawingRectangleYMax = fb->height - 1;
7586 if (dirty & CROCUS_DIRTY_VERTEX_BUFFERS) {
7587 const uint32_t user_count = util_bitcount(ice->state.bound_vertex_buffers);
7588 const uint32_t count = user_count +
7589 ice->state.vs_uses_draw_params + ice->state.vs_uses_derived_draw_params;
7590 uint32_t dynamic_bound = ice->state.bound_vertex_buffers;
7593 const unsigned vb_dwords = GENX(VERTEX_BUFFER_STATE_length);
7596 crocus_get_command_space(batch, 4 * (1 + vb_dwords * count));
7597 _crocus_pack_command(batch, GENX(3DSTATE_VERTEX_BUFFERS), map, vb) {
7598 vb.DWordLength = (vb_dwords * count + 1) - 2;
7602 uint32_t bound = dynamic_bound;
7605 i = u_bit_scan(&bound);
7606 struct pipe_vertex_buffer *buf = &ice->state.vertex_buffers[i];
7607 struct crocus_bo *bo = crocus_resource_bo(buf->buffer.resource);
7608 uint32_t step_rate = ice->state.cso_vertex_elements->step_rate[i];
7610 emit_vertex_buffer_state(batch, i, bo,
7612 ice->state.vb_end[i],
7618 if (ice->state.vs_uses_draw_params) {
7619 struct crocus_resource *res = (struct crocus_resource *)ice->draw.draw_params.res;
7620 emit_vertex_buffer_state(batch, i++,
7622 ice->draw.draw_params.offset,
7623 ice->draw.draw_params.res->width0,
7626 if (ice->state.vs_uses_derived_draw_params) {
7627 struct crocus_resource *res = (struct crocus_resource *)ice->draw.derived_draw_params.res;
7628 emit_vertex_buffer_state(batch, i++,
7630 ice->draw.derived_draw_params.offset,
7631 ice->draw.derived_draw_params.res->width0,
7637 if (dirty & CROCUS_DIRTY_VERTEX_ELEMENTS) {
7638 struct crocus_vertex_element_state *cso = ice->state.cso_vertex_elements;
7639 const unsigned entries = MAX2(cso->count, 1);
7640 if (!(ice->state.vs_needs_sgvs_element ||
7641 ice->state.vs_uses_derived_draw_params ||
7642 ice->state.vs_needs_edge_flag)) {
7643 crocus_batch_emit(batch, cso->vertex_elements, sizeof(uint32_t) *
7644 (1 + entries * GENX(VERTEX_ELEMENT_STATE_length)));
7646 uint32_t dynamic_ves[1 + 33 * GENX(VERTEX_ELEMENT_STATE_length)];
7647 const unsigned dyn_count = cso->count +
7648 ice->state.vs_needs_sgvs_element +
7649 ice->state.vs_uses_derived_draw_params;
7651 crocus_pack_command(GENX(3DSTATE_VERTEX_ELEMENTS),
7654 1 + GENX(VERTEX_ELEMENT_STATE_length) * dyn_count - 2;
7656 memcpy(&dynamic_ves[1], &cso->vertex_elements[1],
7657 (cso->count - ice->state.vs_needs_edge_flag) *
7658 GENX(VERTEX_ELEMENT_STATE_length) * sizeof(uint32_t));
7659 uint32_t *ve_pack_dest =
7660 &dynamic_ves[1 + (cso->count - ice->state.vs_needs_edge_flag) *
7661 GENX(VERTEX_ELEMENT_STATE_length)];
7663 if (ice->state.vs_needs_sgvs_element) {
7664 uint32_t base_ctrl = ice->state.vs_uses_draw_params ?
7665 VFCOMP_STORE_SRC : VFCOMP_STORE_0;
7666 crocus_pack_state(GENX(VERTEX_ELEMENT_STATE), ve_pack_dest, ve) {
7668 ve.VertexBufferIndex =
7669 util_bitcount64(ice->state.bound_vertex_buffers);
7670 ve.SourceElementFormat = ISL_FORMAT_R32G32_UINT;
7671 ve.Component0Control = base_ctrl;
7672 ve.Component1Control = base_ctrl;
7674 ve.Component2Control = ice->state.vs_uses_vertexid ? VFCOMP_STORE_VID : VFCOMP_STORE_0;
7675 ve.Component3Control = ice->state.vs_uses_instanceid ? VFCOMP_STORE_IID : VFCOMP_STORE_0;
7677 ve.Component2Control = VFCOMP_STORE_0;
7678 ve.Component3Control = VFCOMP_STORE_0;
7681 ve.DestinationElementOffset = cso->count * 4;
7684 ve_pack_dest += GENX(VERTEX_ELEMENT_STATE_length);
7686 if (ice->state.vs_uses_derived_draw_params) {
7687 crocus_pack_state(GENX(VERTEX_ELEMENT_STATE), ve_pack_dest, ve) {
7689 ve.VertexBufferIndex =
7690 util_bitcount64(ice->state.bound_vertex_buffers) +
7691 ice->state.vs_uses_draw_params;
7692 ve.SourceElementFormat = ISL_FORMAT_R32G32_UINT;
7693 ve.Component0Control = VFCOMP_STORE_SRC;
7694 ve.Component1Control = VFCOMP_STORE_SRC;
7695 ve.Component2Control = VFCOMP_STORE_0;
7696 ve.Component3Control = VFCOMP_STORE_0;
7698 ve.DestinationElementOffset = (cso->count + ice->state.vs_needs_sgvs_element) * 4;
7701 ve_pack_dest += GENX(VERTEX_ELEMENT_STATE_length);
7703 if (ice->state.vs_needs_edge_flag) {
7704 for (int i = 0; i < GENX(VERTEX_ELEMENT_STATE_length); i++)
7705 ve_pack_dest[i] = cso->edgeflag_ve[i];
7708 crocus_batch_emit(batch, &dynamic_ves, sizeof(uint32_t) *
7709 (1 + dyn_count * GENX(VERTEX_ELEMENT_STATE_length)));
7713 if (!ice->state.vs_needs_edge_flag) {
7714 crocus_batch_emit(batch, cso->vf_instancing, sizeof(uint32_t) *
7715 entries * GENX(3DSTATE_VF_INSTANCING_length));
7717 assert(cso->count > 0);
7718 const unsigned edgeflag_index = cso->count - 1;
7719 uint32_t dynamic_vfi[33 * GENX(3DSTATE_VF_INSTANCING_length)];
7720 memcpy(&dynamic_vfi[0], cso->vf_instancing, edgeflag_index *
7721 GENX(3DSTATE_VF_INSTANCING_length) * sizeof(uint32_t));
7723 uint32_t *vfi_pack_dest = &dynamic_vfi[0] +
7724 edgeflag_index * GENX(3DSTATE_VF_INSTANCING_length);
7725 crocus_pack_command(GENX(3DSTATE_VF_INSTANCING), vfi_pack_dest, vi) {
7726 vi.VertexElementIndex = edgeflag_index +
7727 ice->state.vs_needs_sgvs_element +
7728 ice->state.vs_uses_derived_draw_params;
7730 for (int i = 0; i < GENX(3DSTATE_VF_INSTANCING_length); i++)
7731 vfi_pack_dest[i] |= cso->edgeflag_vfi[i];
7733 crocus_batch_emit(batch, &dynamic_vfi[0], sizeof(uint32_t) *
7734 entries * GENX(3DSTATE_VF_INSTANCING_length));
7740 if (dirty & CROCUS_DIRTY_GEN8_VF_SGVS) {
7741 const struct brw_vs_prog_data *vs_prog_data = (void *)
7742 ice->shaders.prog[MESA_SHADER_VERTEX]->prog_data;
7743 struct crocus_vertex_element_state *cso = ice->state.cso_vertex_elements;
7745 crocus_emit_cmd(batch, GENX(3DSTATE_VF_SGVS), sgv) {
7746 if (vs_prog_data->uses_vertexid) {
7747 sgv.VertexIDEnable = true;
7748 sgv.VertexIDComponentNumber = 2;
7749 sgv.VertexIDElementOffset =
7750 cso->count - ice->state.vs_needs_edge_flag;
7753 if (vs_prog_data->uses_instanceid) {
7754 sgv.InstanceIDEnable = true;
7755 sgv.InstanceIDComponentNumber = 3;
7756 sgv.InstanceIDElementOffset =
7757 cso->count - ice->state.vs_needs_edge_flag;
7762 #if GFX_VERx10 >= 75
7763 if (dirty & CROCUS_DIRTY_GEN75_VF) {
7764 crocus_emit_cmd(batch, GENX(3DSTATE_VF), vf) {
7765 if (draw->primitive_restart) {
7766 vf.IndexedDrawCutIndexEnable = true;
7767 vf.CutIndex = draw->restart_index;
7774 if (dirty & CROCUS_DIRTY_GEN8_PMA_FIX) {
7775 bool enable = want_pma_fix(ice);
7776 genX(crocus_update_pma_fix)(ice, batch, enable);
7781 if (dirty & CROCUS_DIRTY_GEN4_CURBE) {
7782 gen4_upload_curbe(batch);
/* Upload all render state for a single draw and emit the 3DPRIMITIVE
 * command.  Handles direct, indexed, indirect (multi-draw with a
 * GPU-side draw count), and — on HSW+ — draw-auto draws whose vertex
 * count comes from a stream-output write offset.
 */
7788 crocus_upload_render_state(struct crocus_context *ice,
7789 struct crocus_batch *batch,
7790 const struct pipe_draw_info *draw,
7791 unsigned drawid_offset,
7792 const struct pipe_draw_indirect_info *indirect,
7793 const struct pipe_draw_start_count_bias *sc)
/* Start from the conditional-rendering predicate state; the
 * indirect-draw-count path below may force predication on as well. */
7796 bool use_predicate = ice->state.predicate == CROCUS_PREDICATE_STATE_USE_BIT;
/* no_wrap brackets the dirty-state flush — presumably so the emitted
 * state and the draw that consumes it land in the same batch
 * (NOTE(review): inferred from the bracketing; confirm). */
7799 batch->no_wrap = true;
7800 batch->contains_draw = true;
7802 crocus_update_surface_base_address(batch);
7804 crocus_upload_dirty_render_state(ice, batch, draw);
7806 batch->no_wrap = false;
/* Indexed draw: make ice->state.index_buffer point at the right BO and
 * re-emit 3DSTATE_INDEX_BUFFER only when something relevant changed. */
7807 if (draw->index_size > 0) {
7810 bool emit_index = false;
7812 if (draw->has_user_indices) {
/* Client-memory indices: copy just the range this draw uses into a
 * GPU-visible upload buffer, then bias `offset` back by start_offset so
 * sc->start still indexes correctly into the uploaded data. */
7813 unsigned start_offset = draw->index_size * sc->start;
7814 u_upload_data(ice->ctx.stream_uploader, 0,
7815 sc->count * draw->index_size, 4,
7816 (char *)draw->index.user + start_offset,
7817 &offset, &ice->state.index_buffer.res);
7818 offset -= start_offset;
7819 size = start_offset + sc->count * draw->index_size;
7822 struct crocus_resource *res = (void *) draw->index.resource;
7824 if (ice->state.index_buffer.res != draw->index.resource) {
/* Record that this resource has been used as an index buffer so later
 * writes to it can trigger the appropriate invalidation. */
7825 res->bind_history |= PIPE_BIND_INDEX_BUFFER;
7826 pipe_resource_reference(&ice->state.index_buffer.res,
7827 draw->index.resource);
7831 size = draw->index.resource->width0;
/* Re-emit only if the buffer geometry, index size, or primitive-restart
 * setting differs from what was last programmed. */
7835 (ice->state.index_buffer.size != size ||
7836 ice->state.index_buffer.index_size != draw->index_size
7838 || ice->state.index_buffer.prim_restart != draw->primitive_restart
7845 struct crocus_bo *bo = crocus_resource_bo(ice->state.index_buffer.res);
7847 crocus_emit_cmd(batch, GENX(3DSTATE_INDEX_BUFFER), ib) {
7849 ib.CutIndexEnable = draw->primitive_restart;
/* Map 1/2/4-byte index sizes to the HW format field (1>>1 = 0,
 * 2>>1 = 1, 4>>1 = 2). */
7851 ib.IndexFormat = draw->index_size >> 1;
7852 ib.BufferStartingAddress = ro_bo(bo, offset);
7854 ib.BufferSize = bo->size - offset;
7856 ib.BufferEndingAddress = ro_bo(bo, offset + size - 1);
7859 ib.MOCS = crocus_mocs(bo, &batch->screen->isl_dev);
/* Cache what was programmed so the next draw can skip the re-emit. */
7862 ice->state.index_buffer.size = size;
7863 ice->state.index_buffer.offset = offset;
7864 ice->state.index_buffer.index_size = draw->index_size;
7866 ice->state.index_buffer.prim_restart = draw->primitive_restart;
/* MMIO registers that feed an indirect 3DPRIMITIVE. */
7871 #define _3DPRIM_END_OFFSET 0x2420
7872 #define _3DPRIM_START_VERTEX 0x2430
7873 #define _3DPRIM_VERTEX_COUNT 0x2434
7874 #define _3DPRIM_INSTANCE_COUNT 0x2438
7875 #define _3DPRIM_START_INSTANCE 0x243C
7876 #define _3DPRIM_BASE_VERTEX 0x2440
7879 if (indirect && !indirect->count_from_stream_output) {
7880 if (indirect->indirect_draw_count) {
/* Multi-draw-indirect with a GPU-side draw count: predicate each
 * sub-draw on (drawid_offset < draw_count). */
7881 use_predicate = true;
7883 struct crocus_bo *draw_count_bo =
7884 crocus_resource_bo(indirect->indirect_draw_count);
7885 unsigned draw_count_offset =
7886 indirect->indirect_draw_count_offset;
7888 crocus_emit_pipe_control_flush(batch,
7889 "ensure indirect draw buffer is flushed",
7890 PIPE_CONTROL_FLUSH_ENABLE);
7891 if (ice->state.predicate == CROCUS_PREDICATE_STATE_USE_BIT) {
7892 #if GFX_VERx10 >= 75
/* HSW+: conditional rendering is also active — AND the draw-count
 * comparison with the conditional-rendering predicate held in GPR15. */
7893 struct mi_builder b;
7894 mi_builder_init(&b, &batch->screen->devinfo, batch);
7896 /* comparison = draw id < draw count */
7897 struct mi_value comparison =
7898 mi_ult(&b, mi_imm(drawid_offset),
7899 mi_mem32(ro_bo(draw_count_bo,
7900 draw_count_offset)));
7902 /* predicate = comparison & conditional rendering predicate */
7903 mi_store(&b, mi_reg32(MI_PREDICATE_RESULT),
7904 mi_iand(&b, comparison, mi_reg32(CS_GPR(15))));
7906 /* predicate = comparison & conditional rendering predicate */
7907 struct mi_value pred = mi_iand(&b, comparison,
7908 mi_reg32(CS_GPR(15)));
7910 mi_store(&b, mi_reg64(MI_PREDICATE_SRC0), pred);
7911 mi_store(&b, mi_reg64(MI_PREDICATE_SRC1), mi_imm(0));
7913 unsigned mi_predicate = MI_PREDICATE | MI_PREDICATE_LOADOP_LOADINV |
7914 MI_PREDICATE_COMBINEOP_SET |
7915 MI_PREDICATE_COMPAREOP_SRCS_EQUAL;
7917 crocus_batch_emit(batch, &mi_predicate, sizeof(uint32_t));
7921 uint32_t mi_predicate;
7923 /* Upload the id of the current primitive to MI_PREDICATE_SRC1. */
7924 crocus_load_register_imm64(batch, MI_PREDICATE_SRC1, drawid_offset);
7925 /* Upload the current draw count from the draw parameters buffer
7926 * to MI_PREDICATE_SRC0.
7928 crocus_load_register_mem32(batch, MI_PREDICATE_SRC0,
7929 draw_count_bo, draw_count_offset);
7930 /* Zero the top 32-bits of MI_PREDICATE_SRC0 */
7931 crocus_load_register_imm32(batch, MI_PREDICATE_SRC0 + 4, 0);
7933 if (drawid_offset == 0) {
7934 mi_predicate = MI_PREDICATE | MI_PREDICATE_LOADOP_LOADINV |
7935 MI_PREDICATE_COMBINEOP_SET |
7936 MI_PREDICATE_COMPAREOP_SRCS_EQUAL;
7938 /* While draw_index < draw_count the predicate's result will be
7939 * (draw_index == draw_count) ^ TRUE = TRUE
7940 * When draw_index == draw_count the result is
7941 * (TRUE) ^ TRUE = FALSE
7942 * After this all results will be:
7943 * (FALSE) ^ FALSE = FALSE
7945 mi_predicate = MI_PREDICATE | MI_PREDICATE_LOADOP_LOAD |
7946 MI_PREDICATE_COMBINEOP_XOR |
7947 MI_PREDICATE_COMPAREOP_SRCS_EQUAL;
7949 crocus_batch_emit(batch, &mi_predicate, sizeof(uint32_t));
/* Load the 3DPRIM_* registers straight from the indirect buffer.  The
 * indexed layout carries a BaseVertex dword at +12; the non-indexed
 * layout is one dword shorter, so BaseVertex is cleared via LRI
 * instead. */
7954 struct crocus_bo *bo = crocus_resource_bo(indirect->buffer);
7957 crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
7958 lrm.RegisterAddress = _3DPRIM_VERTEX_COUNT;
7959 lrm.MemoryAddress = ro_bo(bo, indirect->offset + 0);
7961 crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
7962 lrm.RegisterAddress = _3DPRIM_INSTANCE_COUNT;
7963 lrm.MemoryAddress = ro_bo(bo, indirect->offset + 4);
7965 crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
7966 lrm.RegisterAddress = _3DPRIM_START_VERTEX;
7967 lrm.MemoryAddress = ro_bo(bo, indirect->offset + 8);
7969 if (draw->index_size) {
7970 crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
7971 lrm.RegisterAddress = _3DPRIM_BASE_VERTEX;
7972 lrm.MemoryAddress = ro_bo(bo, indirect->offset + 12);
7974 crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
7975 lrm.RegisterAddress = _3DPRIM_START_INSTANCE;
7976 lrm.MemoryAddress = ro_bo(bo, indirect->offset + 16);
7979 crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
7980 lrm.RegisterAddress = _3DPRIM_START_INSTANCE;
7981 lrm.MemoryAddress = ro_bo(bo, indirect->offset + 12);
7983 crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
7984 lri.RegisterOffset = _3DPRIM_BASE_VERTEX;
7989 } else if (indirect && indirect->count_from_stream_output) {
/* Draw-auto: derive the vertex count from the transform-feedback
 * write offset (HSW+ only, via the MI math builder). */
7990 #if GFX_VERx10 >= 75
7991 struct crocus_stream_output_target *so =
7992 (void *) indirect->count_from_stream_output;
7994 /* XXX: Replace with actual cache tracking */
7995 crocus_emit_pipe_control_flush(batch,
7996 "draw count from stream output stall",
7997 PIPE_CONTROL_CS_STALL);
7999 struct mi_builder b;
8000 mi_builder_init(&b, &batch->screen->devinfo, batch);
8002 struct crocus_address addr =
8003 ro_bo(crocus_resource_bo(&so->offset_res->base.b), so->offset_offset);
8004 struct mi_value offset =
8005 mi_iadd_imm(&b, mi_mem32(addr), -so->base.buffer_offset);
/* vertex count = bytes written / vertex stride */
8007 mi_store(&b, mi_reg32(_3DPRIM_VERTEX_COUNT),
8008 mi_udiv32_imm(&b, offset, so->stride));
8010 _crocus_emit_lri(batch, _3DPRIM_START_VERTEX, 0);
8011 _crocus_emit_lri(batch, _3DPRIM_BASE_VERTEX, 0);
8012 _crocus_emit_lri(batch, _3DPRIM_START_INSTANCE, 0);
8013 _crocus_emit_lri(batch, _3DPRIM_INSTANCE_COUNT, draw->instance_count);
/* Finally, the draw itself.  For indirect draws the counts and offsets
 * come from the 3DPRIM_* registers loaded above. */
8020 crocus_emit_cmd(batch, GENX(3DPRIMITIVE), prim) {
8021 prim.VertexAccessType = draw->index_size > 0 ? RANDOM : SEQUENTIAL;
8023 prim.PredicateEnable = use_predicate;
8026 prim.PrimitiveTopologyType = translate_prim_type(ice->state.prim_mode, ice->state.patch_vertices);
8028 // XXX Probably have to do something for gen6 here?
8030 prim.IndirectParameterEnable = true;
8034 prim.StartInstanceLocation = draw->start_instance;
8036 prim.InstanceCount = draw->instance_count;
8037 prim.VertexCountPerInstance = sc->count;
8039 prim.StartVertexLocation = sc->start;
8041 if (draw->index_size) {
8042 prim.BaseVertexLocation += sc->index_bias;
8051 crocus_upload_compute_state(struct crocus_context *ice,
8052 struct crocus_batch *batch,
8053 const struct pipe_grid_info *grid)
8055 const uint64_t stage_dirty = ice->state.stage_dirty;
8056 struct crocus_screen *screen = batch->screen;
8057 const struct intel_device_info *devinfo = &screen->devinfo;
8058 struct crocus_shader_state *shs = &ice->state.shaders[MESA_SHADER_COMPUTE];
8059 struct crocus_compiled_shader *shader =
8060 ice->shaders.prog[MESA_SHADER_COMPUTE];
8061 struct brw_stage_prog_data *prog_data = shader->prog_data;
8062 struct brw_cs_prog_data *cs_prog_data = (void *) prog_data;
8063 const struct brw_cs_dispatch_info dispatch =
8064 brw_cs_get_dispatch_info(devinfo, cs_prog_data, grid->block);
8066 crocus_update_surface_base_address(batch);
8067 if ((stage_dirty & CROCUS_STAGE_DIRTY_CONSTANTS_CS) && shs->sysvals_need_upload)
8068 upload_sysvals(ice, MESA_SHADER_COMPUTE);
8070 if (stage_dirty & CROCUS_STAGE_DIRTY_BINDINGS_CS) {
8071 crocus_populate_binding_table(ice, batch, MESA_SHADER_COMPUTE, false);
8072 ice->shaders.prog[MESA_SHADER_COMPUTE]->bind_bo_offset =
8073 crocus_upload_binding_table(ice, batch,
8074 ice->shaders.prog[MESA_SHADER_COMPUTE]->surf_offset,
8075 ice->shaders.prog[MESA_SHADER_COMPUTE]->bt.size_bytes);
8078 if (stage_dirty & CROCUS_STAGE_DIRTY_SAMPLER_STATES_CS)
8079 crocus_upload_sampler_states(ice, batch, MESA_SHADER_COMPUTE);
8081 if ((stage_dirty & CROCUS_STAGE_DIRTY_CS) ||
8082 cs_prog_data->local_size[0] == 0 /* Variable local group size */) {
8083 /* The MEDIA_VFE_STATE documentation for Gen8+ says:
8085 * "A stalling PIPE_CONTROL is required before MEDIA_VFE_STATE unless
8086 * the only bits that are changed are scoreboard related: Scoreboard
8087 * Enable, Scoreboard Type, Scoreboard Mask, Scoreboard Delta. For
8088 * these scoreboard related states, a MEDIA_STATE_FLUSH is
8091 crocus_emit_pipe_control_flush(batch,
8092 "workaround: stall before MEDIA_VFE_STATE",
8093 PIPE_CONTROL_CS_STALL);
8095 crocus_emit_cmd(batch, GENX(MEDIA_VFE_STATE), vfe) {
8096 if (prog_data->total_scratch) {
8097 struct crocus_bo *bo =
8098 crocus_get_scratch_space(ice, prog_data->total_scratch,
8099 MESA_SHADER_COMPUTE);
8101 /* Broadwell's Per Thread Scratch Space is in the range [0, 11]
8102 * where 0 = 1k, 1 = 2k, 2 = 4k, ..., 11 = 2M.
8104 vfe.PerThreadScratchSpace = ffs(prog_data->total_scratch) - 11;
8105 #elif GFX_VERx10 == 75
8106 /* Haswell's Per Thread Scratch Space is in the range [0, 10]
8107 * where 0 = 2k, 1 = 4k, 2 = 8k, ..., 10 = 2M.
8109 vfe.PerThreadScratchSpace = ffs(prog_data->total_scratch) - 12;
8111 /* Earlier platforms use the range [0, 11] to mean [1kB, 12kB]
8112 * where 0 = 1kB, 1 = 2kB, 2 = 3kB, ..., 11 = 12kB.
8114 vfe.PerThreadScratchSpace = prog_data->total_scratch / 1024 - 1;
8116 vfe.ScratchSpaceBasePointer = rw_bo(bo, 0);
8119 vfe.MaximumNumberofThreads =
8120 devinfo->max_cs_threads * devinfo->subslice_total - 1;
8121 vfe.ResetGatewayTimer =
8122 Resettingrelativetimerandlatchingtheglobaltimestamp;
8123 vfe.BypassGatewayControl = true;
8125 vfe.GPGPUMode = true;
8128 vfe.BypassGatewayControl = true;
8130 vfe.NumberofURBEntries = GFX_VER == 8 ? 2 : 0;
8131 vfe.URBEntryAllocationSize = GFX_VER == 8 ? 2 : 0;
8133 vfe.CURBEAllocationSize =
8134 ALIGN(cs_prog_data->push.per_thread.regs * dispatch.threads +
8135 cs_prog_data->push.cross_thread.regs, 2);
8139 /* TODO: Combine subgroup-id with cbuf0 so we can push regular uniforms */
8140 if ((stage_dirty & CROCUS_STAGE_DIRTY_CS) ||
8141 cs_prog_data->local_size[0] == 0 /* Variable local group size */) {
8142 uint32_t curbe_data_offset = 0;
8143 assert(cs_prog_data->push.cross_thread.dwords == 0 &&
8144 cs_prog_data->push.per_thread.dwords == 1 &&
8145 cs_prog_data->base.param[0] == BRW_PARAM_BUILTIN_SUBGROUP_ID);
8146 const unsigned push_const_size =
8147 brw_cs_push_const_total_size(cs_prog_data, dispatch.threads);
8148 uint32_t *curbe_data_map =
8150 ALIGN(push_const_size, 64), 64,
8151 &curbe_data_offset);
8152 assert(curbe_data_map);
8153 memset(curbe_data_map, 0x5a, ALIGN(push_const_size, 64));
8154 crocus_fill_cs_push_const_buffer(cs_prog_data, dispatch.threads,
8157 crocus_emit_cmd(batch, GENX(MEDIA_CURBE_LOAD), curbe) {
8158 curbe.CURBETotalDataLength = ALIGN(push_const_size, 64);
8159 curbe.CURBEDataStartAddress = curbe_data_offset;
8163 if (stage_dirty & (CROCUS_STAGE_DIRTY_SAMPLER_STATES_CS |
8164 CROCUS_STAGE_DIRTY_BINDINGS_CS |
8165 CROCUS_STAGE_DIRTY_CONSTANTS_CS |
8166 CROCUS_STAGE_DIRTY_CS)) {
8167 uint32_t desc[GENX(INTERFACE_DESCRIPTOR_DATA_length)];
8168 const uint64_t ksp = KSP(ice,shader) + brw_cs_prog_data_prog_offset(cs_prog_data, dispatch.simd_size);
8169 crocus_pack_state(GENX(INTERFACE_DESCRIPTOR_DATA), desc, idd) {
8170 idd.KernelStartPointer = ksp;
8171 idd.SamplerStatePointer = shs->sampler_offset;
8172 idd.BindingTablePointer = ice->shaders.prog[MESA_SHADER_COMPUTE]->bind_bo_offset;
8173 idd.BindingTableEntryCount = MIN2(shader->bt.size_bytes / 4, 31);
8174 idd.NumberofThreadsinGPGPUThreadGroup = dispatch.threads;
8175 idd.ConstantURBEntryReadLength = cs_prog_data->push.per_thread.regs;
8176 idd.BarrierEnable = cs_prog_data->uses_barrier;
8177 idd.SharedLocalMemorySize = encode_slm_size(GFX_VER,
8178 prog_data->total_shared);
8179 #if GFX_VERx10 >= 75
8180 idd.CrossThreadConstantDataReadLength = cs_prog_data->push.cross_thread.regs;
8184 crocus_emit_cmd(batch, GENX(MEDIA_INTERFACE_DESCRIPTOR_LOAD), load) {
8185 load.InterfaceDescriptorTotalLength =
8186 GENX(INTERFACE_DESCRIPTOR_DATA_length) * sizeof(uint32_t);
8187 load.InterfaceDescriptorDataStartAddress =
8188 emit_state(batch, desc, sizeof(desc), 64);
8192 #define GPGPU_DISPATCHDIMX 0x2500
8193 #define GPGPU_DISPATCHDIMY 0x2504
8194 #define GPGPU_DISPATCHDIMZ 0x2508
8196 if (grid->indirect) {
8197 struct crocus_state_ref *grid_size = &ice->state.grid_size;
8198 struct crocus_bo *bo = crocus_resource_bo(grid_size->res);
8199 crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
8200 lrm.RegisterAddress = GPGPU_DISPATCHDIMX;
8201 lrm.MemoryAddress = ro_bo(bo, grid_size->offset + 0);
8203 crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
8204 lrm.RegisterAddress = GPGPU_DISPATCHDIMY;
8205 lrm.MemoryAddress = ro_bo(bo, grid_size->offset + 4);
8207 crocus_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
8208 lrm.RegisterAddress = GPGPU_DISPATCHDIMZ;
8209 lrm.MemoryAddress = ro_bo(bo, grid_size->offset + 8);
8213 /* Clear upper 32-bits of SRC0 and all 64-bits of SRC1 */
8214 _crocus_emit_lri(batch, MI_PREDICATE_SRC0 + 4, 0);
8215 crocus_load_register_imm64(batch, MI_PREDICATE_SRC1, 0);
8217 /* Load compute_dispatch_indirect_x_size into SRC0 */
8218 crocus_load_register_mem32(batch, MI_PREDICATE_SRC0, bo, grid_size->offset + 0);
8220 /* predicate = (compute_dispatch_indirect_x_size == 0); */
8221 crocus_emit_cmd(batch, GENX(MI_PREDICATE), mip) {
8222 mip.LoadOperation = LOAD_LOAD;
8223 mip.CombineOperation = COMBINE_SET;
8224 mip.CompareOperation = COMPARE_SRCS_EQUAL;
8227 /* Load compute_dispatch_indirect_y_size into SRC0 */
8228 crocus_load_register_mem32(batch, MI_PREDICATE_SRC0, bo, grid_size->offset + 4);
8230 /* predicate = (compute_dispatch_indirect_y_size == 0); */
8231 crocus_emit_cmd(batch, GENX(MI_PREDICATE), mip) {
8232 mip.LoadOperation = LOAD_LOAD;
8233 mip.CombineOperation = COMBINE_OR;
8234 mip.CompareOperation = COMPARE_SRCS_EQUAL;
8237 /* Load compute_dispatch_indirect_z_size into SRC0 */
8238 crocus_load_register_mem32(batch, MI_PREDICATE_SRC0, bo, grid_size->offset + 8);
8240 /* predicate = (compute_dispatch_indirect_z_size == 0); */
8241 crocus_emit_cmd(batch, GENX(MI_PREDICATE), mip) {
8242 mip.LoadOperation = LOAD_LOAD;
8243 mip.CombineOperation = COMBINE_OR;
8244 mip.CompareOperation = COMPARE_SRCS_EQUAL;
8247 /* predicate = !predicate; */
8248 #define COMPARE_FALSE 1
8249 crocus_emit_cmd(batch, GENX(MI_PREDICATE), mip) {
8250 mip.LoadOperation = LOAD_LOADINV;
8251 mip.CombineOperation = COMBINE_OR;
8252 mip.CompareOperation = COMPARE_FALSE;
8257 crocus_emit_cmd(batch, GENX(GPGPU_WALKER), ggw) {
8258 ggw.IndirectParameterEnable = grid->indirect != NULL;
8259 ggw.PredicateEnable = GFX_VER <= 7 && grid->indirect != NULL;
8260 ggw.SIMDSize = dispatch.simd_size / 16;
8261 ggw.ThreadDepthCounterMaximum = 0;
8262 ggw.ThreadHeightCounterMaximum = 0;
8263 ggw.ThreadWidthCounterMaximum = dispatch.threads - 1;
8264 ggw.ThreadGroupIDXDimension = grid->grid[0];
8265 ggw.ThreadGroupIDYDimension = grid->grid[1];
8266 ggw.ThreadGroupIDZDimension = grid->grid[2];
8267 ggw.RightExecutionMask = dispatch.right_mask;
8268 ggw.BottomExecutionMask = 0xffffffff;
8271 crocus_emit_cmd(batch, GENX(MEDIA_STATE_FLUSH), msf);
8273 batch->contains_draw = true;
8276 #endif /* GFX_VER >= 7 */
8279 * State module teardown.
/**
 * State module teardown: release every resource reference held by the
 * context's state module (draw parameters, stream-output targets,
 * framebuffer surfaces, per-stage bindings, vertex/index buffers, and
 * the compute grid-size buffer) so the underlying resources can be freed.
 *
 * NOTE(review): the "static void" declarator line and several closing
 * braces are not visible in this extract.
 */
crocus_destroy_state(struct crocus_context *ice)
   /* Indirect/derived draw-parameter upload buffers. */
   pipe_resource_reference(&ice->draw.draw_params.res, NULL);
   pipe_resource_reference(&ice->draw.derived_draw_params.res, NULL);

   free(ice->state.genx);

   /* Stream-output targets (4 binding slots). */
   for (int i = 0; i < 4; i++) {
      pipe_so_target_reference(&ice->state.so_target[i], NULL);

   /* Framebuffer color attachments, then depth/stencil. */
   for (unsigned i = 0; i < ice->state.framebuffer.nr_cbufs; i++) {
      pipe_surface_reference(&ice->state.framebuffer.cbufs[i], NULL);
   pipe_surface_reference(&ice->state.framebuffer.zsbuf, NULL);

   /* Per-shader-stage bindings: UBOs, images, SSBOs, sampler views. */
   for (int stage = 0; stage < MESA_SHADER_STAGES; stage++) {
      struct crocus_shader_state *shs = &ice->state.shaders[stage];
      for (int i = 0; i < PIPE_MAX_CONSTANT_BUFFERS; i++) {
         pipe_resource_reference(&shs->constbufs[i].buffer, NULL);
      for (int i = 0; i < PIPE_MAX_SHADER_IMAGES; i++) {
         pipe_resource_reference(&shs->image[i].base.resource, NULL);
      for (int i = 0; i < PIPE_MAX_SHADER_BUFFERS; i++) {
         pipe_resource_reference(&shs->ssbo[i].buffer, NULL);
      for (int i = 0; i < CROCUS_MAX_TEXTURE_SAMPLERS; i++) {
         pipe_sampler_view_reference((struct pipe_sampler_view **)
                                     &shs->textures[i], NULL);

   /* Vertex buffers (16 slots), compute grid size, and the index buffer. */
   for (int i = 0; i < 16; i++)
      pipe_resource_reference(&ice->state.vertex_buffers[i].buffer.resource, NULL);
   pipe_resource_reference(&ice->state.grid_size.res, NULL);

   pipe_resource_reference(&ice->state.index_buffer.res, NULL);
8322 /* ------------------------------------------------------------------- */
/**
 * Re-validate every context binding that references a buffer resource whose
 * backing BO has been replaced, flagging the appropriate dirty bits (or
 * re-binding through the normal path) so the next draw re-emits them.
 *
 * NOTE(review): the return-type line, several closing braces, and the
 * `continue;` after the bind_stages check are not visible in this extract.
 */
crocus_rebind_buffer(struct crocus_context *ice,
                     struct crocus_resource *res)
   struct pipe_context *ctx = &ice->ctx;

   assert(res->base.b.target == PIPE_BUFFER);

   /* Buffers can't be framebuffer attachments, nor display related,
    * and we don't have upstream Clover support.
    */
   assert(!(res->bind_history & (PIPE_BIND_DEPTH_STENCIL |
                                 PIPE_BIND_RENDER_TARGET |
                                 PIPE_BIND_BLENDABLE |
                                 PIPE_BIND_DISPLAY_TARGET |
                                 PIPE_BIND_COMPUTE_RESOURCE |
                                 PIPE_BIND_GLOBAL)));

   if (res->bind_history & PIPE_BIND_VERTEX_BUFFER) {
      /* Walk the set bits of the bound-VB mask; each bit is a slot index. */
      uint64_t bound_vbs = ice->state.bound_vertex_buffers;
         const int i = u_bit_scan64(&bound_vbs);
         struct pipe_vertex_buffer *buffer = &ice->state.vertex_buffers[i];

         if (!buffer->is_user_buffer && &res->base.b == buffer->buffer.resource)
            ice->state.dirty |= CROCUS_DIRTY_VERTEX_BUFFERS;

   if ((res->bind_history & PIPE_BIND_INDEX_BUFFER) &&
       ice->state.index_buffer.res) {
      /* Dropping the reference forces the index buffer to be re-emitted. */
      if (res->bo == crocus_resource_bo(ice->state.index_buffer.res))
         pipe_resource_reference(&ice->state.index_buffer.res, NULL);

   /* There is no need to handle these:
    * - PIPE_BIND_COMMAND_ARGS_BUFFER (emitted for every indirect draw)
    * - PIPE_BIND_QUERY_BUFFER (no persistent state references)
    */

   if (res->bind_history & PIPE_BIND_STREAM_OUTPUT) {
      /* XXX: be careful about resetting vs appending... */
      for (int i = 0; i < 4; i++) {
         if (ice->state.so_target[i] &&
             (ice->state.so_target[i]->buffer == &res->base.b)) {
            ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_BINDINGS_GS;
            ice->state.dirty |= CROCUS_DIRTY_GEN7_SO_BUFFERS;

   /* Per-stage bindings: UBOs, SSBOs, sampler views, and images. */
   for (int s = MESA_SHADER_VERTEX; s < MESA_SHADER_STAGES; s++) {
      struct crocus_shader_state *shs = &ice->state.shaders[s];
      enum pipe_shader_type p_stage = stage_to_pipe(s);

      if (!(res->bind_stages & (1 << s)))

      if (res->bind_history & PIPE_BIND_CONSTANT_BUFFER) {
         /* Skip constant buffer 0, it's for regular uniforms, not UBOs */
         uint32_t bound_cbufs = shs->bound_cbufs & ~1u;
         while (bound_cbufs) {
            const int i = u_bit_scan(&bound_cbufs);
            struct pipe_constant_buffer *cbuf = &shs->constbufs[i];

            if (res->bo == crocus_resource_bo(cbuf->buffer)) {
               ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_VS << s;

      if (res->bind_history & PIPE_BIND_SHADER_BUFFER) {
         uint32_t bound_ssbos = shs->bound_ssbos;
         while (bound_ssbos) {
            const int i = u_bit_scan(&bound_ssbos);
            struct pipe_shader_buffer *ssbo = &shs->ssbo[i];

            if (res->bo == crocus_resource_bo(ssbo->buffer)) {
               /* Re-bind through the normal set-shader-buffers path so the
                * surface state is rebuilt against the new BO.
                */
               struct pipe_shader_buffer buf = {
                  .buffer = &res->base.b,
                  .buffer_offset = ssbo->buffer_offset,
                  .buffer_size = ssbo->buffer_size,
               crocus_set_shader_buffers(ctx, p_stage, i, 1, &buf,
                                         (shs->writable_ssbos >> i) & 1);

      if (res->bind_history & PIPE_BIND_SAMPLER_VIEW) {
         uint32_t bound_sampler_views = shs->bound_sampler_views;
         while (bound_sampler_views) {
            const int i = u_bit_scan(&bound_sampler_views);
            struct crocus_sampler_view *isv = shs->textures[i];
            struct crocus_bo *bo = isv->res->bo;

            if (res->bo == bo) {
               ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_BINDINGS_VS << s;

      if (res->bind_history & PIPE_BIND_SHADER_IMAGE) {
         uint32_t bound_image_views = shs->bound_image_views;
         while (bound_image_views) {
            const int i = u_bit_scan(&bound_image_views);
            struct crocus_image_view *iv = &shs->image[i];
            struct crocus_bo *bo = crocus_resource_bo(iv->base.resource);

               ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_BINDINGS_VS << s;
8443 /* ------------------------------------------------------------------- */
/**
 * Translate a PIPE_CONTROL_WRITE_* flag into the hardware's
 * "Post Sync Operation" field value.  At most one write flag is expected
 * at a time (get_post_sync_flags() below asserts this).
 *
 * NOTE(review): the return-type line and the trailing default return are
 * not visible in this extract.
 */
flags_to_post_sync_op(uint32_t flags)
   if (flags & PIPE_CONTROL_WRITE_IMMEDIATE)
      return WriteImmediateData;

   if (flags & PIPE_CONTROL_WRITE_DEPTH_COUNT)
      return WritePSDepthCount;

   if (flags & PIPE_CONTROL_WRITE_TIMESTAMP)
      return WriteTimestamp;
8461 * Do the given flags have a Post Sync or LRI Post Sync operation?
/* Extract just the Post Sync / LRI Post Sync related bits from `flags`,
 * asserting that at most one of them is set.
 * NOTE(review): the final `return flags;` line is not visible in this
 * extract.
 */
static enum pipe_control_flags
get_post_sync_flags(enum pipe_control_flags flags)
   /* Mask down to only the post-sync-op related bits. */
   flags &= PIPE_CONTROL_WRITE_IMMEDIATE |
            PIPE_CONTROL_WRITE_DEPTH_COUNT |
            PIPE_CONTROL_WRITE_TIMESTAMP |
            PIPE_CONTROL_LRI_POST_SYNC_OP;

   /* Only one "Post Sync Op" is allowed, and it's mutually exclusive with
    * "LRI Post Sync Operation".  So more than one bit set would be illegal.
    */
   assert(util_bitcount(flags) <= 1);
8479 #define IS_COMPUTE_PIPELINE(batch) (batch->name == CROCUS_BATCH_COMPUTE)
8482 * Emit a series of PIPE_CONTROL commands, taking into account any
8483 * workarounds necessary to actually accomplish the caller's request.
8485 * Unless otherwise noted, spec quotations in this function come from:
8487 * Synchronization of the 3D Pipeline > PIPE_CONTROL Command > Programming
8488 * Restrictions for PIPE_CONTROL.
8490 * You should not use this function directly. Use the helpers in
8491 * crocus_pipe_control.c instead, which may split the pipe control further.
crocus_emit_raw_pipe_control(struct crocus_batch *batch,
                             struct crocus_bo *bo,
   /* NOTE(review): the reason/flags/offset/imm parameter lines are not
    * visible in this extract; the body below reads `flags`, `offset`, and
    * `imm` in addition to `batch` and `bo`.
    */
   UNUSED const struct intel_device_info *devinfo = &batch->screen->devinfo;
   enum pipe_control_flags post_sync_flags = get_post_sync_flags(flags);
   UNUSED enum pipe_control_flags non_lri_post_sync_flags =
      post_sync_flags & ~PIPE_CONTROL_LRI_POST_SYNC_OP;

   /* Recursive PIPE_CONTROL workarounds --------------------------------
    * (http://knowyourmeme.com/memes/xzibit-yo-dawg)
    *
    * We do these first because we want to look at the original operation,
    * rather than any workarounds we set.
    */

   /* "Flush Types" workarounds ---------------------------------------------
    * We do these now because they may add post-sync operations or CS stalls.
    */

   if (GFX_VER == 6 && (flags & PIPE_CONTROL_RENDER_TARGET_FLUSH)) {
      /* Hardware workaround: SNB B-Spec says:
       *
       *    "[Dev-SNB{W/A}]: Before a PIPE_CONTROL with Write Cache Flush
       *     Enable = 1, a PIPE_CONTROL with any non-zero post-sync-op is
       */
      crocus_emit_post_sync_nonzero_flush(batch);

   if (flags & PIPE_CONTROL_VF_CACHE_INVALIDATE) {
      /* Project: BDW, SKL+ (stopping at CNL) / Argument: VF Invalidate
       *
       *    "'Post Sync Operation' must be enabled to 'Write Immediate Data' or
       *     'Write PS Depth Count' or 'Write Timestamp'."
       */
      flags |= PIPE_CONTROL_WRITE_IMMEDIATE;
      post_sync_flags |= PIPE_CONTROL_WRITE_IMMEDIATE;
      non_lri_post_sync_flags |= PIPE_CONTROL_WRITE_IMMEDIATE;
      /* Redirect the forced write to the context's scratch workaround BO. */
      bo = batch->ice->workaround_bo;
      offset = batch->ice->workaround_offset;

   if (flags & PIPE_CONTROL_DEPTH_STALL) {
      /* Project: PRE-HSW / Argument: Depth Stall
       *
       *    "The following bits must be clear:
       *     - Render Target Cache Flush Enable ([12] of DW1)
       *     - Depth Cache Flush Enable ([0] of DW1)"
       */
      assert(!(flags & (PIPE_CONTROL_RENDER_TARGET_FLUSH |
                        PIPE_CONTROL_DEPTH_CACHE_FLUSH)));

   if (GFX_VER >= 6 && (flags & PIPE_CONTROL_DEPTH_STALL)) {
      /* From the PIPE_CONTROL instruction table, bit 13 (Depth Stall Enable):
       *
       *    "This bit must be DISABLED for operations other than writing
       *
       * This seems like nonsense. An Ivybridge workaround requires us to
       * emit a PIPE_CONTROL with a depth stall and write immediate post-sync
       * operation. Gen8+ requires us to emit depth stalls and depth cache
       * flushes together. So, it's hard to imagine this means anything other
       * than "we originally intended this to be used for PS_DEPTH_COUNT".
       *
       * We ignore the supposed restriction and do nothing.
       */

   if (GFX_VERx10 < 75 && (flags & PIPE_CONTROL_DEPTH_CACHE_FLUSH)) {
      /* Project: PRE-HSW / Argument: Depth Cache Flush
       *
       *    "Depth Stall must be clear ([13] of DW1)."
       */
      assert(!(flags & PIPE_CONTROL_DEPTH_STALL));

   if (flags & (PIPE_CONTROL_RENDER_TARGET_FLUSH |
                PIPE_CONTROL_STALL_AT_SCOREBOARD)) {
      /* From the PIPE_CONTROL instruction table, bit 12 and bit 1:
       *
       *    "This bit must be DISABLED for End-of-pipe (Read) fences,
       *     PS_DEPTH_COUNT or TIMESTAMP queries."
       *
       * TODO: Implement end-of-pipe checking.
       */
      assert(!(post_sync_flags & (PIPE_CONTROL_WRITE_DEPTH_COUNT |
                                  PIPE_CONTROL_WRITE_TIMESTAMP)));

   if (flags & PIPE_CONTROL_STALL_AT_SCOREBOARD) {
      /* From the PIPE_CONTROL instruction table, bit 1:
       *
       *    "This bit is ignored if Depth Stall Enable is set.
       *     Further, the render cache is not flushed even if Write Cache
       *     Flush Enable bit is set."
       *
       * We assert that the caller doesn't do this combination, to try and
       * prevent mistakes. It shouldn't hurt the GPU, though.
       *
       * We skip this check on Gen11+ as the "Stall at Pixel Scoreboard"
       * and "Render Target Flush" combo is explicitly required for BTI
       * update workarounds.
       */
      assert(!(flags & (PIPE_CONTROL_DEPTH_STALL |
                        PIPE_CONTROL_RENDER_TARGET_FLUSH)));

   /* PIPE_CONTROL page workarounds ------------------------------------- */

   if (GFX_VER >= 7 && (flags & PIPE_CONTROL_STATE_CACHE_INVALIDATE)) {
      /* From the PIPE_CONTROL page itself:
       *
       *    Restriction: Pipe_control with CS-stall bit set must be issued
       *    before a pipe-control command that has the State Cache
       *    Invalidate bit set."
       */
      flags |= PIPE_CONTROL_CS_STALL;

   if ((GFX_VERx10 == 75)) {
      /* From the PIPE_CONTROL page itself:
       *
       *    "HSW - Programming Note: PIPECONTROL with RO Cache Invalidation:
       *     Prior to programming a PIPECONTROL command with any of the RO
       *     cache invalidation bit set, program a PIPECONTROL flush command
       *     with "CS stall" bit and "HDC Flush" bit set."
       *
       * TODO: Actually implement this. What's an HDC Flush?
       */

   if (flags & PIPE_CONTROL_FLUSH_LLC) {
      /* From the PIPE_CONTROL instruction table, bit 26 (Flush LLC):
       *
       *    SW must always program Post-Sync Operation to "Write Immediate
       *    Data" when Flush LLC is set."
       *
       * For now, we just require the caller to do it.
       */
      assert(flags & PIPE_CONTROL_WRITE_IMMEDIATE);

   /* "Post-Sync Operation" workarounds -------------------------------- */

   /* Project: All / Argument: Global Snapshot Count Reset [19]
    *
    *    "This bit must not be exercised on any product.
    *     Requires stall bit ([20] of DW1) set."
    *
    * We don't use this, so we just assert that it isn't used. The
    * PIPE_CONTROL instruction page indicates that they intended this
    * as a debug feature and don't think it is useful in production,
    * but it may actually be usable, should we ever want to.
    */
   assert((flags & PIPE_CONTROL_GLOBAL_SNAPSHOT_COUNT_RESET) == 0);

   if (flags & (PIPE_CONTROL_MEDIA_STATE_CLEAR |
                PIPE_CONTROL_INDIRECT_STATE_POINTERS_DISABLE)) {
      /* Project: All / Arguments:
       *
       * - Generic Media State Clear [16]
       * - Indirect State Pointers Disable [16]
       *
       *    "Requires stall bit ([20] of DW1) set."
       *
       * Also, the PIPE_CONTROL instruction table, bit 16 (Generic Media
       * State Clear) says:
       *
       *    "PIPECONTROL command with "Command Streamer Stall Enable" must be
       *     programmed prior to programming a PIPECONTROL command with "Media
       *     State Clear" set in GPGPU mode of operation"
       *
       * This is a subset of the earlier rule, so there's nothing to do.
       */
      flags |= PIPE_CONTROL_CS_STALL;

   if (flags & PIPE_CONTROL_STORE_DATA_INDEX) {
      /* Project: All / Argument: Store Data Index
       *
       *    "Post-Sync Operation ([15:14] of DW1) must be set to something other
       *
       * For now, we just assert that the caller does this. We might want to
       * automatically add a write to the workaround BO...
       */
      assert(non_lri_post_sync_flags != 0);

   if (flags & PIPE_CONTROL_SYNC_GFDT) {
      /* Project: All / Argument: Sync GFDT
       *
       *    "Post-Sync Operation ([15:14] of DW1) must be set to something other
       *     than '0' or 0x2520[13] must be set."
       *
       * For now, we just assert that the caller does this.
       */
      assert(non_lri_post_sync_flags != 0);

   if (GFX_VER >= 6 && GFX_VER < 8 && (flags & PIPE_CONTROL_TLB_INVALIDATE)) {
      /* Project: SNB, IVB, HSW / Argument: TLB inv
       *
       *    "{All SKUs}{All Steppings}: Post-Sync Operation ([15:14] of DW1)
       *     must be set to something other than '0'."
       *
       * For now, we just assert that the caller does this.
       */
      assert(non_lri_post_sync_flags != 0);

   if (GFX_VER >= 7 && (flags & PIPE_CONTROL_TLB_INVALIDATE)) {
      /* Project: IVB+ / Argument: TLB inv
       *
       *    "Requires stall bit ([20] of DW1) set."
       *
       * Also, from the PIPE_CONTROL instruction table:
       *
       *    Post Sync Operation or CS stall must be set to ensure a TLB
       *    invalidation occurs. Otherwise no cycle will occur to the TLB
       *    cache to invalidate."
       *
       * This is not a subset of the earlier rule, so there's nothing to do.
       */
      flags |= PIPE_CONTROL_CS_STALL;

   if (IS_COMPUTE_PIPELINE(batch)) {
      if (post_sync_flags ||
          (flags & (PIPE_CONTROL_NOTIFY_ENABLE |
                    PIPE_CONTROL_DEPTH_STALL |
                    PIPE_CONTROL_RENDER_TARGET_FLUSH |
                    PIPE_CONTROL_DEPTH_CACHE_FLUSH |
                    PIPE_CONTROL_DATA_CACHE_FLUSH))) {
         /* Project: BDW / Arguments:
          *
          * - LRI Post Sync Operation [23]
          * - Post Sync Op [15:14]
          * - Depth Stall [13]
          * - Render Target Cache Flush [12]
          * - Depth Cache Flush [0]
          * - DC Flush Enable [5]
          *
          *    "Requires stall bit ([20] of DW) set for all GPGPU and Media
          *
          * (The docs have separate table rows for each bit, with essentially
          * the same workaround text. We've combined them here.)
          */
         flags |= PIPE_CONTROL_CS_STALL;

      /* Also, from the PIPE_CONTROL instruction table, bit 20:
       *
       *    This bit must be always set when PIPE_CONTROL command is
       *    programmed by GPGPU and MEDIA workloads, except for the cases
       *    when only Read Only Cache Invalidation bits are set (State
       *    Cache Invalidation Enable, Instruction cache Invalidation
       *    Enable, Texture Cache Invalidation Enable, Constant Cache
       *    Invalidation Enable). This is to WA FFDOP CG issue, this WA
       *    need not implemented when FF_DOP_CG is disable via "Fixed
       *    Function DOP Clock Gate Disable" bit in RC_PSMI_CTRL register."
       *
       * It sounds like we could avoid CS stalls in some cases, but we
       * don't currently bother. This list isn't exactly the list above,
       */

   /* Implement the WaCsStallAtEveryFourthPipecontrol workaround on IVB, BYT:
    *
    *    "Every 4th PIPE_CONTROL command, not counting the PIPE_CONTROL with
    *     only read-cache-invalidate bit(s) set, must have a CS_STALL bit set."
    *
    * Note that the kernel does CS stalls between batches, so we only need
    * to count them within a batch. We currently naively count every 4, and
    * don't skip the ones with only read-cache-invalidate bits set. This
    * may or may not be a problem...
    */
   if (GFX_VER == 7 && !(GFX_VERx10 == 75)) {
      if (flags & PIPE_CONTROL_CS_STALL) {
         /* If we're doing a CS stall, reset the counter and carry on. */
         batch->pipe_controls_since_last_cs_stall = 0;

      /* If this is the fourth pipe control without a CS stall, do one now. */
      if (++batch->pipe_controls_since_last_cs_stall == 4) {
         batch->pipe_controls_since_last_cs_stall = 0;
         flags |= PIPE_CONTROL_CS_STALL;

   /* "Stall" workarounds ----------------------------------------------
    * These have to come after the earlier ones because we may have added
    * some additional CS stalls above.
    */

   if (flags & PIPE_CONTROL_CS_STALL) {
      /* Project: PRE-SKL, VLV, CHV
       *
       *    "[All Stepping][All SKUs]:
       *
       *     One of the following must also be set:
       *
       *     - Render Target Cache Flush Enable ([12] of DW1)
       *     - Depth Cache Flush Enable ([0] of DW1)
       *     - Stall at Pixel Scoreboard ([1] of DW1)
       *     - Depth Stall ([13] of DW1)
       *     - Post-Sync Operation ([13] of DW1)
       *     - DC Flush Enable ([5] of DW1)"
       *
       * If we don't already have one of those bits set, we choose to add
       * "Stall at Pixel Scoreboard". Some of the other bits require a
       * CS stall as a workaround (see above), which would send us into
       * an infinite recursion of PIPE_CONTROLs. "Stall at Pixel Scoreboard"
       * appears to be safe, so we choose that.
       */
      const uint32_t wa_bits = PIPE_CONTROL_RENDER_TARGET_FLUSH |
                               PIPE_CONTROL_DEPTH_CACHE_FLUSH |
                               PIPE_CONTROL_WRITE_IMMEDIATE |
                               PIPE_CONTROL_WRITE_DEPTH_COUNT |
                               PIPE_CONTROL_WRITE_TIMESTAMP |
                               PIPE_CONTROL_STALL_AT_SCOREBOARD |
                               PIPE_CONTROL_DEPTH_STALL |
                               PIPE_CONTROL_DATA_CACHE_FLUSH;
      if (!(flags & wa_bits))
         flags |= PIPE_CONTROL_STALL_AT_SCOREBOARD;

   /* Emit --------------------------------------------------------------- */

   if (INTEL_DEBUG(DEBUG_PIPE_CONTROL)) {
      /* Human-readable dump of the flag bits, for driver debugging. */
         " PC [%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%"PRIx64"]: %s\n",
         (flags & PIPE_CONTROL_FLUSH_ENABLE) ? "PipeCon " : "",
         (flags & PIPE_CONTROL_CS_STALL) ? "CS " : "",
         (flags & PIPE_CONTROL_STALL_AT_SCOREBOARD) ? "Scoreboard " : "",
         (flags & PIPE_CONTROL_VF_CACHE_INVALIDATE) ? "VF " : "",
         (flags & PIPE_CONTROL_RENDER_TARGET_FLUSH) ? "RT " : "",
         (flags & PIPE_CONTROL_CONST_CACHE_INVALIDATE) ? "Const " : "",
         (flags & PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE) ? "TC " : "",
         (flags & PIPE_CONTROL_DATA_CACHE_FLUSH) ? "DC " : "",
         (flags & PIPE_CONTROL_DEPTH_CACHE_FLUSH) ? "ZFlush " : "",
         (flags & PIPE_CONTROL_DEPTH_STALL) ? "ZStall " : "",
         (flags & PIPE_CONTROL_STATE_CACHE_INVALIDATE) ? "State " : "",
         (flags & PIPE_CONTROL_TLB_INVALIDATE) ? "TLB " : "",
         (flags & PIPE_CONTROL_INSTRUCTION_INVALIDATE) ? "Inst " : "",
         (flags & PIPE_CONTROL_MEDIA_STATE_CLEAR) ? "MediaClear " : "",
         (flags & PIPE_CONTROL_NOTIFY_ENABLE) ? "Notify " : "",
         (flags & PIPE_CONTROL_GLOBAL_SNAPSHOT_COUNT_RESET) ?
         (flags & PIPE_CONTROL_INDIRECT_STATE_POINTERS_DISABLE) ?
         (flags & PIPE_CONTROL_WRITE_IMMEDIATE) ? "WriteImm " : "",
         (flags & PIPE_CONTROL_WRITE_DEPTH_COUNT) ? "WriteZCount " : "",
         (flags & PIPE_CONTROL_WRITE_TIMESTAMP) ? "WriteTimestamp " : "",

   /* Finally, translate the accumulated flags into the packet fields. */
   crocus_emit_cmd(batch, GENX(PIPE_CONTROL), pc) {
      pc.LRIPostSyncOperation = NoLRIOperation;
      pc.PipeControlFlushEnable = flags & PIPE_CONTROL_FLUSH_ENABLE;
      pc.DCFlushEnable = flags & PIPE_CONTROL_DATA_CACHE_FLUSH;
      pc.StoreDataIndex = 0;
      pc.CommandStreamerStallEnable = flags & PIPE_CONTROL_CS_STALL;
      pc.GlobalSnapshotCountReset =
         flags & PIPE_CONTROL_GLOBAL_SNAPSHOT_COUNT_RESET;
      pc.TLBInvalidate = flags & PIPE_CONTROL_TLB_INVALIDATE;
      pc.GenericMediaStateClear = flags & PIPE_CONTROL_MEDIA_STATE_CLEAR;
      pc.StallAtPixelScoreboard = flags & PIPE_CONTROL_STALL_AT_SCOREBOARD;
      pc.RenderTargetCacheFlushEnable =
         flags & PIPE_CONTROL_RENDER_TARGET_FLUSH;
      pc.DepthCacheFlushEnable = flags & PIPE_CONTROL_DEPTH_CACHE_FLUSH;
      pc.StateCacheInvalidationEnable =
         flags & PIPE_CONTROL_STATE_CACHE_INVALIDATE;
      pc.VFCacheInvalidationEnable = flags & PIPE_CONTROL_VF_CACHE_INVALIDATE;
      pc.ConstantCacheInvalidationEnable =
         flags & PIPE_CONTROL_CONST_CACHE_INVALIDATE;
      pc.WriteCacheFlush = flags & PIPE_CONTROL_RENDER_TARGET_FLUSH;
      pc.PostSyncOperation = flags_to_post_sync_op(flags);
      pc.DepthStallEnable = flags & PIPE_CONTROL_DEPTH_STALL;
      pc.InstructionCacheInvalidateEnable =
         flags & PIPE_CONTROL_INSTRUCTION_INVALIDATE;
      pc.NotifyEnable = flags & PIPE_CONTROL_NOTIFY_ENABLE;
#if GFX_VER >= 5 || GFX_VERx10 == 45
      pc.IndirectStatePointersDisable =
         flags & PIPE_CONTROL_INDIRECT_STATE_POINTERS_DISABLE;
      pc.TextureCacheInvalidationEnable =
         flags & PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
#elif GFX_VER == 5 || GFX_VERx10 == 45
      pc.TextureCacheFlushEnable =
         flags & PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
      pc.Address = ggtt_bo(bo, offset);
      if (GFX_VER < 7 && bo)
         pc.DestinationAddressType = DAT_GGTT;
      pc.ImmediateData = imm;
/**
 * Program 3DSTATE_URB, partitioning the URB between the VS and GS.
 *
 * NOTE(review): the vs_size / gs_present / gs_size parameter lines and the
 * gs_present if/else around the two sizing paths are not visible in this
 * extract.
 */
genX(crocus_upload_urb)(struct crocus_batch *batch,
   struct crocus_context *ice = batch->ice;
   int nr_vs_entries, nr_gs_entries;
   int total_urb_size = ice->urb.size * 1024; /* in bytes */
   const struct intel_device_info *devinfo = &batch->screen->devinfo;

   /* Calculate how many entries fit in each stage's section of the URB.
    * Entry sizes (vs_size/gs_size) are in 128-byte units.
    */
      nr_vs_entries = (total_urb_size/2) / (vs_size * 128);
      nr_gs_entries = (total_urb_size/2) / (gs_size * 128);
      nr_vs_entries = total_urb_size / (vs_size * 128);

   /* Then clamp to the maximum allowed by the hardware */
   if (nr_vs_entries > devinfo->urb.max_entries[MESA_SHADER_VERTEX])
      nr_vs_entries = devinfo->urb.max_entries[MESA_SHADER_VERTEX];

   if (nr_gs_entries > devinfo->urb.max_entries[MESA_SHADER_GEOMETRY])
      nr_gs_entries = devinfo->urb.max_entries[MESA_SHADER_GEOMETRY];

   /* Finally, both must be a multiple of 4 (see 3DSTATE_URB in the PRM). */
   ice->urb.nr_vs_entries = ROUND_DOWN_TO(nr_vs_entries, 4);
   ice->urb.nr_gs_entries = ROUND_DOWN_TO(nr_gs_entries, 4);

   assert(ice->urb.nr_vs_entries >=
          devinfo->urb.min_entries[MESA_SHADER_VERTEX]);
   assert(ice->urb.nr_vs_entries % 4 == 0);
   assert(ice->urb.nr_gs_entries % 4 == 0);
   assert(vs_size <= 5);
   assert(gs_size <= 5);

   crocus_emit_cmd(batch, GENX(3DSTATE_URB), urb) {
      urb.VSNumberofURBEntries = ice->urb.nr_vs_entries;
      urb.VSURBEntryAllocationSize = vs_size - 1;   /* field is size - 1 */

      urb.GSNumberofURBEntries = ice->urb.nr_gs_entries;
      urb.GSURBEntryAllocationSize = gs_size - 1;

   /* From the PRM Volume 2 part 1, section 1.4.7:
    *
    *    Because of a urb corruption caused by allocating a previous gsunit's
    *    urb entry to vsunit software is required to send a "GS NULL
    *    Fence" (Send URB fence with VS URB size == 1 and GS URB size == 0) plus
    *    a dummy DRAW call before any case where VS will be taking over GS URB
    *
    * It is not clear exactly what this means ("URB fence" is a command that
    * doesn't exist on Gen6). So for now we just do a full pipeline flush as
    */
   if (ice->urb.gs_present && !gs_present)
      crocus_emit_mi_flush(batch);
   ice->urb.gs_present = gs_present;
/* Per-generation hook invoked when hardware context state has been lost.
 * NOTE(review): the return type and body are not visible in this extract.
 */
crocus_lost_genx_state(struct crocus_context *ice, struct crocus_batch *batch)
/* Emit MI_REPORT_PERF_COUNT, writing a performance-counter report (tagged
 * with report_id) into `bo` at `offset_in_bytes`.
 * NOTE(review): the report_id parameter line is not visible in this extract.
 */
crocus_emit_mi_report_perf_count(struct crocus_batch *batch,
                                 struct crocus_bo *bo,
                                 uint32_t offset_in_bytes,
   crocus_emit_cmd(batch, GENX(MI_REPORT_PERF_COUNT), mi_rpc) {
      mi_rpc.MemoryAddress = rw_bo(bo, offset_in_bytes);
      mi_rpc.ReportID = report_id;
9000 * From the PRM, Volume 2a:
9002 * "Indirect State Pointers Disable
9004 * At the completion of the post-sync operation associated with this pipe
9005 * control packet, the indirect state pointers in the hardware are
9006 * considered invalid; the indirect pointers are not saved in the context.
9007 * If any new indirect state commands are executed in the command stream
9008 * while the pipe control is pending, the new indirect state commands are
9011 * [DevIVB+]: Using Invalidate State Pointer (ISP) only inhibits context
9012 * restoring of Push Constant (3DSTATE_CONSTANT_*) commands. Push Constant
9013 * commands are only considered as Indirect State Pointers. Once ISP is
9014 * issued in a context, SW must initialize by programming push constant
9015 * commands for all the shaders (at least to zero length) before attempting
9016 * any rendering operation for the same context."
9018 * 3DSTATE_CONSTANT_* packets are restored during a context restore,
9019 * even though they point to a BO that has been already unreferenced at
9020 * the end of the previous batch buffer. This has been fine so far since
 * we are protected by the scratch page (every address not covered by
9022 * a BO should be pointing to the scratch page). But on CNL, it is
9023 * causing a GPU hang during context restore at the 3DSTATE_CONSTANT_*
9026 * The flag "Indirect State Pointers Disable" in PIPE_CONTROL tells the
9027 * hardware to ignore previous 3DSTATE_CONSTANT_* packets during a
9028 * context restore, so the mentioned hang doesn't happen. However,
9029 * software must program push constant commands for all stages prior to
9030 * rendering anything, so we flag them as dirty.
9032 * Finally, we also make sure to stall at pixel scoreboard to make sure the
9033 * constants have been loaded into the EUs prior to disable the push constants
9034 * so that it doesn't hang a previous 3DPRIMITIVE.
/* See the long "Indirect State Pointers Disable" comment above: stall so
 * outstanding work using the current push constants completes, emit the
 * ISP disable, then mark every stage's constants dirty so they are
 * re-programmed before the next draw.
 * NOTE(review): the bo/offset/imm arguments of the two raw pipe-control
 * calls are not visible in this extract.
 */
gen7_emit_isp_disable(struct crocus_batch *batch)
   /* Stall at pixel scoreboard + CS stall before disabling the pointers. */
   crocus_emit_raw_pipe_control(batch, "isp disable",
                                PIPE_CONTROL_STALL_AT_SCOREBOARD |
                                PIPE_CONTROL_CS_STALL,
   crocus_emit_raw_pipe_control(batch, "isp disable",
                                PIPE_CONTROL_INDIRECT_STATE_POINTERS_DISABLE |
                                PIPE_CONTROL_CS_STALL,

   struct crocus_context *ice = batch->ice;
   /* ISP invalidates 3DSTATE_CONSTANT_* state; flag all stages dirty. */
   ice->state.stage_dirty |= (CROCUS_STAGE_DIRTY_CONSTANTS_VS |
                              CROCUS_STAGE_DIRTY_CONSTANTS_TCS |
                              CROCUS_STAGE_DIRTY_CONSTANTS_TES |
                              CROCUS_STAGE_DIRTY_CONSTANTS_GS |
                              CROCUS_STAGE_DIRTY_CONSTANTS_FS);
/* Emit any end-of-batch state required before the batch is submitted. */
static void
crocus_state_finish_batch(struct crocus_batch *batch)
{
#if GFX_VERx10 == 75
   /* Haswell workaround: flush and re-point CC state at batch end on the
    * render engine.
    */
   if (batch->name == CROCUS_BATCH_RENDER) {
      crocus_emit_mi_flush(batch);
      crocus_emit_cmd(batch, GENX(3DSTATE_CC_STATE_POINTERS), ptr) {
         ptr.ColorCalcStatePointer = batch->ice->shaders.cc_offset;
      }

      crocus_emit_pipe_control_flush(batch, "hsw wa", PIPE_CONTROL_RENDER_TARGET_FLUSH |
                                     PIPE_CONTROL_CS_STALL);
   }
#endif
   gen7_emit_isp_disable(batch);
}
9078 crocus_batch_reset_dirty(struct crocus_batch *batch)
9080 /* unreference any index buffer so it get reemitted. */
9081 pipe_resource_reference(&batch->ice->state.index_buffer.res, NULL);
9083 /* for GEN4/5 need to reemit anything that ends up in the state batch that points to anything in the state batch
9084 * as the old state batch won't still be available.
9086 batch->ice->state.dirty |= CROCUS_DIRTY_DEPTH_BUFFER |
9087 CROCUS_DIRTY_COLOR_CALC_STATE;
9089 batch->ice->state.dirty |= CROCUS_DIRTY_VERTEX_ELEMENTS | CROCUS_DIRTY_VERTEX_BUFFERS;
9091 batch->ice->state.stage_dirty |= CROCUS_ALL_STAGE_DIRTY_BINDINGS;
9092 batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_SAMPLER_STATES_VS;
9093 batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_SAMPLER_STATES_TES;
9094 batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_SAMPLER_STATES_TCS;
9095 batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_SAMPLER_STATES_GS;
9096 batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_SAMPLER_STATES_PS;
9097 batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_SAMPLER_STATES_CS;
9099 batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_VS;
9100 batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_TES;
9101 batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_TCS;
9102 batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_GS;
9103 batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_FS;
9104 batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CONSTANTS_CS;
9106 batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_VS;
9107 batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_GS;
9108 batch->ice->state.stage_dirty |= CROCUS_STAGE_DIRTY_CS;
9109 batch->ice->state.dirty |= CROCUS_DIRTY_CC_VIEWPORT | CROCUS_DIRTY_SF_CL_VIEWPORT;
9113 batch->ice->state.dirty |= CROCUS_DIRTY_GEN6_BLEND_STATE;
9114 batch->ice->state.dirty |= CROCUS_DIRTY_GEN6_SCISSOR_RECT;
9115 batch->ice->state.dirty |= CROCUS_DIRTY_GEN6_WM_DEPTH_STENCIL;
9119 /* dirty the SF state on gen4/5 */
9120 batch->ice->state.dirty |= CROCUS_DIRTY_RASTER;
9121 batch->ice->state.dirty |= CROCUS_DIRTY_GEN4_CURBE;
9122 batch->ice->state.dirty |= CROCUS_DIRTY_CLIP;
9123 batch->ice->state.dirty |= CROCUS_DIRTY_WM;
9126 /* Streamout dirty */
9127 batch->ice->state.dirty |= CROCUS_DIRTY_STREAMOUT;
9128 batch->ice->state.dirty |= CROCUS_DIRTY_SO_DECL_LIST;
9129 batch->ice->state.dirty |= CROCUS_DIRTY_GEN7_SO_BUFFERS;
#if GFX_VERx10 == 75
/* Expose the currently-bound rasterizer CSO (Haswell-only helper). */
struct pipe_rasterizer_state *crocus_get_rast_state(struct crocus_context *ice)
{
   return &ice->state.cso_rast->cso;
}
#endif
9141 static void update_so_strides(struct crocus_context *ice,
9144 for (int i = 0; i < PIPE_MAX_SO_BUFFERS; i++) {
9145 struct crocus_stream_output_target *so = (void *)ice->state.so_target[i];
9147 so->stride = strides[i] * sizeof(uint32_t);
9152 static void crocus_fill_clamp_mask(const struct crocus_sampler_state *samp,
9154 uint32_t *clamp_mask)
9157 if (samp->pstate.min_img_filter != PIPE_TEX_FILTER_NEAREST &&
9158 samp->pstate.mag_img_filter != PIPE_TEX_FILTER_NEAREST) {
9159 if (samp->pstate.wrap_s == PIPE_TEX_WRAP_CLAMP)
9160 clamp_mask[0] |= (1 << s);
9161 if (samp->pstate.wrap_t == PIPE_TEX_WRAP_CLAMP)
9162 clamp_mask[1] |= (1 << s);
9163 if (samp->pstate.wrap_r == PIPE_TEX_WRAP_CLAMP)
9164 clamp_mask[2] |= (1 << s);
9170 crocus_set_frontend_noop(struct pipe_context *ctx, bool enable)
9172 struct crocus_context *ice = (struct crocus_context *) ctx;
9174 if (crocus_batch_prepare_noop(&ice->batches[CROCUS_BATCH_RENDER], enable)) {
9175 ice->state.dirty |= CROCUS_ALL_DIRTY_FOR_RENDER;
9176 ice->state.stage_dirty |= CROCUS_ALL_STAGE_DIRTY_FOR_RENDER;
9179 if (ice->batch_count == 1)
9182 if (crocus_batch_prepare_noop(&ice->batches[CROCUS_BATCH_COMPUTE], enable)) {
9183 ice->state.dirty |= CROCUS_ALL_DIRTY_FOR_COMPUTE;
9184 ice->state.stage_dirty |= CROCUS_ALL_STAGE_DIRTY_FOR_COMPUTE;
9189 genX(crocus_init_screen_state)(struct crocus_screen *screen)
9191 assert(screen->devinfo.verx10 == GFX_VERx10);
9192 assert(screen->devinfo.ver == GFX_VER);
9193 screen->vtbl.destroy_state = crocus_destroy_state;
9194 screen->vtbl.init_render_context = crocus_init_render_context;
9195 screen->vtbl.upload_render_state = crocus_upload_render_state;
9197 screen->vtbl.init_compute_context = crocus_init_compute_context;
9198 screen->vtbl.upload_compute_state = crocus_upload_compute_state;
9200 screen->vtbl.emit_raw_pipe_control = crocus_emit_raw_pipe_control;
9201 screen->vtbl.emit_mi_report_perf_count = crocus_emit_mi_report_perf_count;
9202 screen->vtbl.rebind_buffer = crocus_rebind_buffer;
9203 #if GFX_VERx10 >= 75
9204 screen->vtbl.load_register_reg32 = crocus_load_register_reg32;
9205 screen->vtbl.load_register_reg64 = crocus_load_register_reg64;
9206 screen->vtbl.load_register_imm32 = crocus_load_register_imm32;
9207 screen->vtbl.load_register_imm64 = crocus_load_register_imm64;
9208 screen->vtbl.store_data_imm32 = crocus_store_data_imm32;
9209 screen->vtbl.store_data_imm64 = crocus_store_data_imm64;
9212 screen->vtbl.load_register_mem32 = crocus_load_register_mem32;
9213 screen->vtbl.load_register_mem64 = crocus_load_register_mem64;
9214 screen->vtbl.copy_mem_mem = crocus_copy_mem_mem;
9215 screen->vtbl.create_so_decl_list = crocus_create_so_decl_list;
9217 screen->vtbl.update_surface_base_address = crocus_update_surface_base_address;
9219 screen->vtbl.store_register_mem32 = crocus_store_register_mem32;
9220 screen->vtbl.store_register_mem64 = crocus_store_register_mem64;
9222 screen->vtbl.populate_vs_key = crocus_populate_vs_key;
9223 screen->vtbl.populate_tcs_key = crocus_populate_tcs_key;
9224 screen->vtbl.populate_tes_key = crocus_populate_tes_key;
9225 screen->vtbl.populate_gs_key = crocus_populate_gs_key;
9226 screen->vtbl.populate_fs_key = crocus_populate_fs_key;
9227 screen->vtbl.populate_cs_key = crocus_populate_cs_key;
9228 screen->vtbl.lost_genx_state = crocus_lost_genx_state;
9230 screen->vtbl.finish_batch = crocus_state_finish_batch;
9233 screen->vtbl.upload_urb_fence = crocus_upload_urb_fence;
9234 screen->vtbl.calculate_urb_fence = crocus_calculate_urb_fence;
9236 screen->vtbl.fill_clamp_mask = crocus_fill_clamp_mask;
9237 screen->vtbl.batch_reset_dirty = crocus_batch_reset_dirty;
9238 screen->vtbl.translate_prim_type = translate_prim_type;
9240 screen->vtbl.update_so_strides = update_so_strides;
9241 screen->vtbl.get_so_offset = crocus_get_so_offset;
9244 genX(crocus_init_blt)(screen);
9248 genX(crocus_init_state)(struct crocus_context *ice)
9250 struct pipe_context *ctx = &ice->ctx;
9252 ctx->create_blend_state = crocus_create_blend_state;
9253 ctx->create_depth_stencil_alpha_state = crocus_create_zsa_state;
9254 ctx->create_rasterizer_state = crocus_create_rasterizer_state;
9255 ctx->create_sampler_state = crocus_create_sampler_state;
9256 ctx->create_sampler_view = crocus_create_sampler_view;
9257 ctx->create_surface = crocus_create_surface;
9258 ctx->create_vertex_elements_state = crocus_create_vertex_elements;
9259 ctx->bind_blend_state = crocus_bind_blend_state;
9260 ctx->bind_depth_stencil_alpha_state = crocus_bind_zsa_state;
9261 ctx->bind_sampler_states = crocus_bind_sampler_states;
9262 ctx->bind_rasterizer_state = crocus_bind_rasterizer_state;
9263 ctx->bind_vertex_elements_state = crocus_bind_vertex_elements_state;
9264 ctx->delete_blend_state = crocus_delete_state;
9265 ctx->delete_depth_stencil_alpha_state = crocus_delete_state;
9266 ctx->delete_rasterizer_state = crocus_delete_state;
9267 ctx->delete_sampler_state = crocus_delete_state;
9268 ctx->delete_vertex_elements_state = crocus_delete_state;
9269 ctx->set_blend_color = crocus_set_blend_color;
9270 ctx->set_clip_state = crocus_set_clip_state;
9271 ctx->set_constant_buffer = crocus_set_constant_buffer;
9272 ctx->set_shader_buffers = crocus_set_shader_buffers;
9273 ctx->set_shader_images = crocus_set_shader_images;
9274 ctx->set_sampler_views = crocus_set_sampler_views;
9275 ctx->set_tess_state = crocus_set_tess_state;
9276 ctx->set_patch_vertices = crocus_set_patch_vertices;
9277 ctx->set_framebuffer_state = crocus_set_framebuffer_state;
9278 ctx->set_polygon_stipple = crocus_set_polygon_stipple;
9279 ctx->set_sample_mask = crocus_set_sample_mask;
9280 ctx->set_scissor_states = crocus_set_scissor_states;
9281 ctx->set_stencil_ref = crocus_set_stencil_ref;
9282 ctx->set_vertex_buffers = crocus_set_vertex_buffers;
9283 ctx->set_viewport_states = crocus_set_viewport_states;
9284 ctx->sampler_view_destroy = crocus_sampler_view_destroy;
9285 ctx->surface_destroy = crocus_surface_destroy;
9286 ctx->draw_vbo = crocus_draw_vbo;
9287 ctx->launch_grid = crocus_launch_grid;
9289 ctx->set_frontend_noop = crocus_set_frontend_noop;
9292 ctx->create_stream_output_target = crocus_create_stream_output_target;
9293 ctx->stream_output_target_destroy = crocus_stream_output_target_destroy;
9294 ctx->set_stream_output_targets = crocus_set_stream_output_targets;
9297 ice->state.dirty = ~0ull;
9298 ice->state.stage_dirty = ~0ull;
9300 ice->state.statistics_counters_enabled = true;
9302 ice->state.sample_mask = 0xff;
9303 ice->state.num_viewports = 1;
9304 ice->state.prim_mode = PIPE_PRIM_MAX;
9305 ice->state.reduced_prim_mode = PIPE_PRIM_MAX;
9306 ice->state.genx = calloc(1, sizeof(struct crocus_genx_state));
9307 ice->draw.derived_params.drawid = -1;
9309 /* Default all scissor rectangles to be empty regions. */
9310 for (int i = 0; i < CROCUS_MAX_VIEWPORTS; i++) {
9311 ice->state.scissors[i] = (struct pipe_scissor_state) {
9312 .minx = 1, .maxx = 0, .miny = 1, .maxy = 0,