There are known issues with SDMA across many generations of hardware.
A recent example is that gfx10.3 suffers from random GPU hangs if
userspace uses SDMA.
Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7908>
``AMD_DEBUG``
a comma-separated list of named flags, which do various things:
-``nodma``
- Disable SDMA
-``nodmaclear``
- Disable SDMA clears
-``nodmacopyimage``
- Disable SDMA image copies
``nodcc``
Disable DCC.
``nodccclear``
Disable DCC fast clear.
``nooptvariant``
Disable compiling optimized shader variants.
-``forcedma``
- Use SDMA for all operations when possible.
``nowc``
Disable GTT write combining
``check_vm``
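For context, the flags documented above are parsed once at screen creation into a debug_flags bitmask and then tested with the DBG() macro throughout the driver (see the debug_flags & DBG(CHECK_VM) checks later in this patch). Below is a minimal standalone sketch of that pattern using gallium's debug_get_flags_option() helper; the enum, DBG() macro, and option table here are illustrative stand-ins, not the driver's actual definitions.

#include <stdint.h>
#include "util/u_debug.h"

/* Illustrative stand-ins for the driver's DBG_* enum and DBG() macro. */
enum { DBG_NO_WC, DBG_CHECK_VM };
#define DBG(name) (1ull << DBG_##name)

static const struct debug_named_value example_debug_options[] = {
   {"nowc", DBG(NO_WC), "Disable GTT write combining"},
   {"check_vm", DBG(CHECK_VM), "Check VM faults and dump debug info."},
   DEBUG_NAMED_VALUE_END
};

/* Parse the comma-separated AMD_DEBUG list into a bitmask of named flags. */
static uint64_t example_get_debug_flags(void)
{
   return debug_get_flags_option("AMD_DEBUG", example_debug_options, 0);
}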
void si_vid_clear_buffer(struct pipe_context *context, struct rvid_buffer *buffer)
{
struct si_context *sctx = (struct si_context *)context;
+ uint32_t zero = 0;
- si_sdma_clear_buffer(sctx, &buffer->res->b.b, 0, buffer->res->b.b.width0, 0);
+ sctx->b.clear_buffer(&sctx->b, &buffer->res->b.b, 0, buffer->res->b.b.width0, &zero, 4);
context->flush(context, NULL, 0);
}
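The SDMA-specific clear above is replaced by the generic pipe_context::clear_buffer hook, which takes a pointer to the fill pattern plus its size in bytes, hence the local zero dword. A small hypothetical usage sketch of the same hook with a different 32-bit pattern (the function name and pattern are made up for illustration):

#include <stdint.h>
#include "pipe/p_context.h"

/* Sketch: fill bytes [offset, offset + size) of a buffer with 0xdeadbeef
 * through the same generic clear_buffer callback the hunk above switches to. */
static void example_fill_buffer(struct pipe_context *ctx,
                                struct pipe_resource *buf,
                                unsigned offset, unsigned size)
{
   uint32_t pattern = 0xdeadbeef;
   ctx->clear_buffer(ctx, buf, offset, size, &pattern, sizeof(pattern));
}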
C_SOURCES := \
- cik_sdma.c \
driinfo_radeonsi.h \
gfx10_query.c \
gfx10_shader_ngg.c \
si_cp_reg_shadowing.c \
si_debug.c \
si_descriptors.c \
- si_dma_cs.c \
si_fence.c \
si_get.c \
si_gfx_cs.c \
+++ /dev/null
-/*
- * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
- * Copyright 2015 Advanced Micro Devices, Inc.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * on the rights to use, copy, modify, merge, publish, distribute, sub
- * license, and/or sell copies of the Software, and to permit persons to whom
- * the Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
- * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
- * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
- * USE OR OTHER DEALINGS IN THE SOFTWARE.
- */
-
-#include "si_pipe.h"
-#include "sid.h"
-
-static unsigned minify_as_blocks(unsigned width, unsigned level, unsigned blk_w)
-{
- width = u_minify(width, level);
- return DIV_ROUND_UP(width, blk_w);
-}
-
-static unsigned encode_tile_info(struct si_context *sctx, struct si_texture *tex, unsigned level,
- bool set_bpp)
-{
- struct radeon_info *info = &sctx->screen->info;
- unsigned tile_index = tex->surface.u.legacy.tiling_index[level];
- unsigned macro_tile_index = tex->surface.u.legacy.macro_tile_index;
- unsigned tile_mode = info->si_tile_mode_array[tile_index];
- unsigned macro_tile_mode = info->cik_macrotile_mode_array[macro_tile_index];
-
- return (set_bpp ? util_logbase2(tex->surface.bpe) : 0) | (G_009910_ARRAY_MODE(tile_mode) << 3) |
- (G_009910_MICRO_TILE_MODE_NEW(tile_mode) << 8) |
- /* Non-depth modes don't have TILE_SPLIT set. */
- ((util_logbase2(tex->surface.u.legacy.tile_split >> 6)) << 11) |
- (G_009990_BANK_WIDTH(macro_tile_mode) << 15) |
- (G_009990_BANK_HEIGHT(macro_tile_mode) << 18) |
- (G_009990_NUM_BANKS(macro_tile_mode) << 21) |
- (G_009990_MACRO_TILE_ASPECT(macro_tile_mode) << 24) |
- (G_009910_PIPE_CONFIG(tile_mode) << 26);
-}
-
-static bool si_sdma_v4_copy_texture(struct si_context *sctx, struct pipe_resource *dst,
- unsigned dst_level, unsigned dstx, unsigned dsty, unsigned dstz,
- struct pipe_resource *src, unsigned src_level,
- const struct pipe_box *src_box)
-{
- struct si_texture *ssrc = (struct si_texture *)src;
- struct si_texture *sdst = (struct si_texture *)dst;
-
- unsigned bpp = sdst->surface.bpe;
- uint64_t dst_address = sdst->buffer.gpu_address + sdst->surface.u.gfx9.surf_offset;
- uint64_t src_address = ssrc->buffer.gpu_address + ssrc->surface.u.gfx9.surf_offset;
- unsigned dst_pitch = sdst->surface.u.gfx9.surf_pitch;
- unsigned src_pitch = ssrc->surface.u.gfx9.surf_pitch;
- uint64_t dst_slice_pitch = ((uint64_t)sdst->surface.u.gfx9.surf_slice_size) / bpp;
- uint64_t src_slice_pitch = ((uint64_t)ssrc->surface.u.gfx9.surf_slice_size) / bpp;
- unsigned srcx = src_box->x / ssrc->surface.blk_w;
- unsigned srcy = src_box->y / ssrc->surface.blk_h;
- unsigned srcz = src_box->z;
- unsigned copy_width = DIV_ROUND_UP(src_box->width, ssrc->surface.blk_w);
- unsigned copy_height = DIV_ROUND_UP(src_box->height, ssrc->surface.blk_h);
- unsigned copy_depth = src_box->depth;
- unsigned xalign = MAX2(1, 4 / bpp);
-
- assert(src_level <= src->last_level);
- assert(dst_level <= dst->last_level);
- assert(sdst->surface.u.gfx9.surf_offset + dst_slice_pitch * bpp * (dstz + src_box->depth) <=
- sdst->buffer.buf->size);
- assert(ssrc->surface.u.gfx9.surf_offset + src_slice_pitch * bpp * (srcz + src_box->depth) <=
- ssrc->buffer.buf->size);
-
- if (!si_prepare_for_dma_blit(sctx, sdst, dst_level, dstx, dsty, dstz, ssrc, src_level, src_box))
- return false;
-
- dstx /= sdst->surface.blk_w;
- dsty /= sdst->surface.blk_h;
-
- if (srcx >= (1 << 14) || srcy >= (1 << 14) || srcz >= (1 << 11) || dstx >= (1 << 14) ||
- dsty >= (1 << 14) || dstz >= (1 << 11))
- return false;
-
- /* Linear -> linear sub-window copy. */
- if (ssrc->surface.is_linear && sdst->surface.is_linear) {
- struct radeon_cmdbuf *cs = &sctx->sdma_cs;
-
- /* Check if everything fits into the bitfields */
- if (!(src_pitch <= (1 << 19) && dst_pitch <= (1 << 19) && src_slice_pitch <= (1 << 28) &&
- dst_slice_pitch <= (1 << 28) && copy_width <= (1 << 14) && copy_height <= (1 << 14) &&
- copy_depth <= (1 << 11)))
- return false;
-
- si_need_dma_space(sctx, 13, &sdst->buffer, &ssrc->buffer);
-
- src_address += ssrc->surface.u.gfx9.offset[src_level];
- dst_address += sdst->surface.u.gfx9.offset[dst_level];
-
- /* Check alignments */
- if ((src_address % 4) != 0 || (dst_address % 4) != 0 || (src_pitch % xalign) != 0)
- return false;
-
- radeon_emit(
- cs, CIK_SDMA_PACKET(CIK_SDMA_OPCODE_COPY, CIK_SDMA_COPY_SUB_OPCODE_LINEAR_SUB_WINDOW,
- sctx->ws->cs_is_secure(cs) ? (1u << 2) : 0) |
- (util_logbase2(bpp) << 29));
- radeon_emit(cs, src_address);
- radeon_emit(cs, src_address >> 32);
- radeon_emit(cs, srcx | (srcy << 16));
- radeon_emit(cs, srcz | ((src_pitch - 1) << 13));
- radeon_emit(cs, src_slice_pitch - 1);
- radeon_emit(cs, dst_address);
- radeon_emit(cs, dst_address >> 32);
- radeon_emit(cs, dstx | (dsty << 16));
- radeon_emit(cs, dstz | ((dst_pitch - 1) << 13));
- radeon_emit(cs, dst_slice_pitch - 1);
- radeon_emit(cs, (copy_width - 1) | ((copy_height - 1) << 16));
- radeon_emit(cs, (copy_depth - 1));
- return true;
- }
-
- /* Linear <-> Tiled sub-window copy */
- if (ssrc->surface.is_linear != sdst->surface.is_linear) {
- struct si_texture *tiled = ssrc->surface.is_linear ? sdst : ssrc;
- struct si_texture *linear = tiled == ssrc ? sdst : ssrc;
- unsigned tiled_level = tiled == ssrc ? src_level : dst_level;
- unsigned linear_level = linear == ssrc ? src_level : dst_level;
- unsigned tiled_x = tiled == ssrc ? srcx : dstx;
- unsigned linear_x = linear == ssrc ? srcx : dstx;
- unsigned tiled_y = tiled == ssrc ? srcy : dsty;
- unsigned linear_y = linear == ssrc ? srcy : dsty;
- unsigned tiled_z = tiled == ssrc ? srcz : dstz;
- unsigned linear_z = linear == ssrc ? srcz : dstz;
- unsigned tiled_width = tiled == ssrc
- ? DIV_ROUND_UP(ssrc->buffer.b.b.width0, ssrc->surface.blk_w)
- : DIV_ROUND_UP(sdst->buffer.b.b.width0, sdst->surface.blk_w);
- unsigned tiled_height = tiled == ssrc
- ? DIV_ROUND_UP(ssrc->buffer.b.b.height0, ssrc->surface.blk_h)
- : DIV_ROUND_UP(sdst->buffer.b.b.height0, sdst->surface.blk_h);
- unsigned tiled_depth = tiled == ssrc ? ssrc->buffer.b.b.depth0 : sdst->buffer.b.b.depth0;
- unsigned linear_pitch = linear == ssrc ? src_pitch : dst_pitch;
- unsigned linear_slice_pitch = linear == ssrc ? src_slice_pitch : dst_slice_pitch;
- uint64_t tiled_address = tiled == ssrc ? src_address : dst_address;
- uint64_t linear_address = linear == ssrc ? src_address : dst_address;
- struct radeon_cmdbuf *cs = &sctx->sdma_cs;
-
- linear_address += linear->surface.u.gfx9.offset[linear_level];
-
- /* Check if everything fits into the bitfields */
- if (!(tiled_x <= (1 << 14) && tiled_y <= (1 << 14) && tiled_z <= (1 << 11) &&
- tiled_width <= (1 << 14) && tiled_height <= (1 << 14) && tiled_depth <= (1 << 11) &&
- linear_x <= (1 << 14) && linear_y <= (1 << 14) && linear_z <= (1 << 11) &&
- linear_pitch <= (1 << 14) && linear_slice_pitch <= (1 << 28) &&
- copy_width <= (1 << 14) && copy_height <= (1 << 14) && copy_depth <= (1 << 11)))
- return false;
-
- /* Check alignments */
- if ((tiled_address % 256 != 0) || (linear_address % 4 != 0) || (linear_pitch % xalign != 0) ||
- (linear_slice_pitch % xalign != 0))
- return false;
-
- si_need_dma_space(sctx, 14, &sdst->buffer, &ssrc->buffer);
-
- radeon_emit(
- cs, CIK_SDMA_PACKET(CIK_SDMA_OPCODE_COPY, CIK_SDMA_COPY_SUB_OPCODE_TILED_SUB_WINDOW,
- sctx->ws->cs_is_secure(cs) ? (1u << 2) : 0) |
- tiled->buffer.b.b.last_level << 20 | tiled_level << 24 |
- (linear == sdst ? 1u : 0) << 31);
- radeon_emit(cs, (uint32_t)tiled_address);
- radeon_emit(cs, (uint32_t)(tiled_address >> 32));
- radeon_emit(cs, tiled_x | (tiled_y << 16));
- radeon_emit(cs, tiled_z | ((tiled_width - 1) << 16));
- radeon_emit(cs, (tiled_height - 1) | (tiled_depth - 1) << 16);
- radeon_emit(cs, util_logbase2(bpp) | tiled->surface.u.gfx9.surf.swizzle_mode << 3 |
- tiled->surface.u.gfx9.resource_type << 9 |
- tiled->surface.u.gfx9.surf.epitch << 16);
- radeon_emit(cs, (uint32_t)linear_address);
- radeon_emit(cs, (uint32_t)(linear_address >> 32));
- radeon_emit(cs, linear_x | (linear_y << 16));
- radeon_emit(cs, linear_z | ((linear_pitch - 1) << 16));
- radeon_emit(cs, linear_slice_pitch - 1);
- radeon_emit(cs, (copy_width - 1) | ((copy_height - 1) << 16));
- radeon_emit(cs, (copy_depth - 1));
- return true;
- }
-
- return false;
-}
-
-static bool cik_sdma_copy_texture(struct si_context *sctx, struct pipe_resource *dst,
- unsigned dst_level, unsigned dstx, unsigned dsty, unsigned dstz,
- struct pipe_resource *src, unsigned src_level,
- const struct pipe_box *src_box)
-{
- struct radeon_info *info = &sctx->screen->info;
- struct si_texture *ssrc = (struct si_texture *)src;
- struct si_texture *sdst = (struct si_texture *)dst;
- unsigned bpp = sdst->surface.bpe;
- uint64_t dst_address = sdst->buffer.gpu_address + sdst->surface.u.legacy.level[dst_level].offset;
- uint64_t src_address = ssrc->buffer.gpu_address + ssrc->surface.u.legacy.level[src_level].offset;
- unsigned dst_mode = sdst->surface.u.legacy.level[dst_level].mode;
- unsigned src_mode = ssrc->surface.u.legacy.level[src_level].mode;
- unsigned dst_tile_index = sdst->surface.u.legacy.tiling_index[dst_level];
- unsigned src_tile_index = ssrc->surface.u.legacy.tiling_index[src_level];
- unsigned dst_tile_mode = info->si_tile_mode_array[dst_tile_index];
- unsigned src_tile_mode = info->si_tile_mode_array[src_tile_index];
- unsigned dst_micro_mode = G_009910_MICRO_TILE_MODE_NEW(dst_tile_mode);
- unsigned src_micro_mode = G_009910_MICRO_TILE_MODE_NEW(src_tile_mode);
- unsigned dst_tile_swizzle = dst_mode == RADEON_SURF_MODE_2D ? sdst->surface.tile_swizzle : 0;
- unsigned src_tile_swizzle = src_mode == RADEON_SURF_MODE_2D ? ssrc->surface.tile_swizzle : 0;
- unsigned dst_pitch = sdst->surface.u.legacy.level[dst_level].nblk_x;
- unsigned src_pitch = ssrc->surface.u.legacy.level[src_level].nblk_x;
- uint64_t dst_slice_pitch =
- ((uint64_t)sdst->surface.u.legacy.level[dst_level].slice_size_dw * 4) / bpp;
- uint64_t src_slice_pitch =
- ((uint64_t)ssrc->surface.u.legacy.level[src_level].slice_size_dw * 4) / bpp;
- unsigned dst_width = minify_as_blocks(sdst->buffer.b.b.width0, dst_level, sdst->surface.blk_w);
- unsigned src_width = minify_as_blocks(ssrc->buffer.b.b.width0, src_level, ssrc->surface.blk_w);
- unsigned dst_height = minify_as_blocks(sdst->buffer.b.b.height0, dst_level, sdst->surface.blk_h);
- unsigned src_height = minify_as_blocks(ssrc->buffer.b.b.height0, src_level, ssrc->surface.blk_h);
- unsigned srcx = src_box->x / ssrc->surface.blk_w;
- unsigned srcy = src_box->y / ssrc->surface.blk_h;
- unsigned srcz = src_box->z;
- unsigned copy_width = DIV_ROUND_UP(src_box->width, ssrc->surface.blk_w);
- unsigned copy_height = DIV_ROUND_UP(src_box->height, ssrc->surface.blk_h);
- unsigned copy_depth = src_box->depth;
-
- assert(src_level <= src->last_level);
- assert(dst_level <= dst->last_level);
- assert(sdst->surface.u.legacy.level[dst_level].offset +
- dst_slice_pitch * bpp * (dstz + src_box->depth) <=
- sdst->buffer.buf->size);
- assert(ssrc->surface.u.legacy.level[src_level].offset +
- src_slice_pitch * bpp * (srcz + src_box->depth) <=
- ssrc->buffer.buf->size);
-
- if (!si_prepare_for_dma_blit(sctx, sdst, dst_level, dstx, dsty, dstz, ssrc, src_level, src_box))
- return false;
-
- dstx /= sdst->surface.blk_w;
- dsty /= sdst->surface.blk_h;
-
- if (srcx >= (1 << 14) || srcy >= (1 << 14) || srcz >= (1 << 11) || dstx >= (1 << 14) ||
- dsty >= (1 << 14) || dstz >= (1 << 11))
- return false;
-
- dst_address |= dst_tile_swizzle << 8;
- src_address |= src_tile_swizzle << 8;
-
- /* Linear -> linear sub-window copy. */
- if (dst_mode == RADEON_SURF_MODE_LINEAR_ALIGNED && src_mode == RADEON_SURF_MODE_LINEAR_ALIGNED &&
- /* check if everything fits into the bitfields */
- src_pitch <= (1 << 14) && dst_pitch <= (1 << 14) && src_slice_pitch <= (1 << 28) &&
- dst_slice_pitch <= (1 << 28) && copy_width <= (1 << 14) && copy_height <= (1 << 14) &&
- copy_depth <= (1 << 11) &&
- /* HW limitation - GFX7: */
- (sctx->chip_class != GFX7 ||
- (copy_width < (1 << 14) && copy_height < (1 << 14) && copy_depth < (1 << 11))) &&
- /* HW limitation - some GFX7 parts: */
- ((sctx->family != CHIP_BONAIRE && sctx->family != CHIP_KAVERI) ||
- (srcx + copy_width != (1 << 14) && srcy + copy_height != (1 << 14)))) {
- struct radeon_cmdbuf *cs = &sctx->sdma_cs;
-
- si_need_dma_space(sctx, 13, &sdst->buffer, &ssrc->buffer);
-
- radeon_emit(
- cs, CIK_SDMA_PACKET(CIK_SDMA_OPCODE_COPY, CIK_SDMA_COPY_SUB_OPCODE_LINEAR_SUB_WINDOW, 0) |
- (util_logbase2(bpp) << 29));
- radeon_emit(cs, src_address);
- radeon_emit(cs, src_address >> 32);
- radeon_emit(cs, srcx | (srcy << 16));
- radeon_emit(cs, srcz | ((src_pitch - 1) << 16));
- radeon_emit(cs, src_slice_pitch - 1);
- radeon_emit(cs, dst_address);
- radeon_emit(cs, dst_address >> 32);
- radeon_emit(cs, dstx | (dsty << 16));
- radeon_emit(cs, dstz | ((dst_pitch - 1) << 16));
- radeon_emit(cs, dst_slice_pitch - 1);
- if (sctx->chip_class == GFX7) {
- radeon_emit(cs, copy_width | (copy_height << 16));
- radeon_emit(cs, copy_depth);
- } else {
- radeon_emit(cs, (copy_width - 1) | ((copy_height - 1) << 16));
- radeon_emit(cs, (copy_depth - 1));
- }
- return true;
- }
-
- /* Tiled <-> linear sub-window copy. */
- if ((src_mode >= RADEON_SURF_MODE_1D) != (dst_mode >= RADEON_SURF_MODE_1D)) {
- struct si_texture *tiled = src_mode >= RADEON_SURF_MODE_1D ? ssrc : sdst;
- struct si_texture *linear = tiled == ssrc ? sdst : ssrc;
- unsigned tiled_level = tiled == ssrc ? src_level : dst_level;
- unsigned linear_level = linear == ssrc ? src_level : dst_level;
- unsigned tiled_x = tiled == ssrc ? srcx : dstx;
- unsigned linear_x = linear == ssrc ? srcx : dstx;
- unsigned tiled_y = tiled == ssrc ? srcy : dsty;
- unsigned linear_y = linear == ssrc ? srcy : dsty;
- unsigned tiled_z = tiled == ssrc ? srcz : dstz;
- unsigned linear_z = linear == ssrc ? srcz : dstz;
- unsigned tiled_width = tiled == ssrc ? src_width : dst_width;
- unsigned linear_width = linear == ssrc ? src_width : dst_width;
- unsigned tiled_pitch = tiled == ssrc ? src_pitch : dst_pitch;
- unsigned linear_pitch = linear == ssrc ? src_pitch : dst_pitch;
- unsigned tiled_slice_pitch = tiled == ssrc ? src_slice_pitch : dst_slice_pitch;
- unsigned linear_slice_pitch = linear == ssrc ? src_slice_pitch : dst_slice_pitch;
- uint64_t tiled_address = tiled == ssrc ? src_address : dst_address;
- uint64_t linear_address = linear == ssrc ? src_address : dst_address;
- unsigned tiled_micro_mode = tiled == ssrc ? src_micro_mode : dst_micro_mode;
-
- assert(tiled_pitch % 8 == 0);
- assert(tiled_slice_pitch % 64 == 0);
- unsigned pitch_tile_max = tiled_pitch / 8 - 1;
- unsigned slice_tile_max = tiled_slice_pitch / 64 - 1;
- unsigned xalign = MAX2(1, 4 / bpp);
- unsigned copy_width_aligned = copy_width;
-
- /* If the region ends at the last pixel and is unaligned, we
- * can copy the remainder of the line that is not visible to
- * make it aligned.
- */
- if (copy_width % xalign != 0 && linear_x + copy_width == linear_width &&
- tiled_x + copy_width == tiled_width &&
- linear_x + align(copy_width, xalign) <= linear_pitch &&
- tiled_x + align(copy_width, xalign) <= tiled_pitch)
- copy_width_aligned = align(copy_width, xalign);
-
- /* HW limitations. */
- if ((sctx->family == CHIP_BONAIRE || sctx->family == CHIP_KAVERI) &&
- linear_pitch - 1 == 0x3fff && bpp == 16)
- return false;
-
- if (sctx->chip_class == GFX7 &&
- (copy_width_aligned == (1 << 14) || copy_height == (1 << 14) || copy_depth == (1 << 11)))
- return false;
-
- if ((sctx->family == CHIP_BONAIRE || sctx->family == CHIP_KAVERI ||
- sctx->family == CHIP_KABINI) &&
- (tiled_x + copy_width == (1 << 14) || tiled_y + copy_height == (1 << 14)))
- return false;
-
- /* The hw can read outside of the given linear buffer bounds,
- * or access those pages but not touch the memory in case
- * of writes. (it still causes a VM fault)
- *
- * Out-of-bounds memory access or page directory access must
- * be prevented.
- */
- int64_t start_linear_address, end_linear_address;
- unsigned granularity;
-
- /* Deduce the size of reads from the linear surface. */
- switch (tiled_micro_mode) {
- case V_009910_ADDR_SURF_DISPLAY_MICRO_TILING:
- granularity = bpp == 1 ? 64 / (8 * bpp) : 128 / (8 * bpp);
- break;
- case V_009910_ADDR_SURF_THIN_MICRO_TILING:
- case V_009910_ADDR_SURF_DEPTH_MICRO_TILING:
- if (0 /* TODO: THICK microtiling */)
- granularity =
- bpp == 1 ? 32 / (8 * bpp)
- : bpp == 2 ? 64 / (8 * bpp) : bpp <= 8 ? 128 / (8 * bpp) : 256 / (8 * bpp);
- else
- granularity = bpp <= 2 ? 64 / (8 * bpp) : bpp <= 8 ? 128 / (8 * bpp) : 256 / (8 * bpp);
- break;
- default:
- return false;
- }
-
- /* The linear reads start at tiled_x & ~(granularity - 1).
- * If linear_x == 0 && tiled_x % granularity != 0, the hw
- * starts reading from an address preceding linear_address!!!
- */
- start_linear_address =
- linear->surface.u.legacy.level[linear_level].offset +
- bpp * (linear_z * linear_slice_pitch + linear_y * linear_pitch + linear_x);
- start_linear_address -= (int)(bpp * (tiled_x % granularity));
-
- end_linear_address =
- linear->surface.u.legacy.level[linear_level].offset +
- bpp * ((linear_z + copy_depth - 1) * linear_slice_pitch +
- (linear_y + copy_height - 1) * linear_pitch + (linear_x + copy_width));
-
- if ((tiled_x + copy_width) % granularity)
- end_linear_address += granularity - (tiled_x + copy_width) % granularity;
-
- if (start_linear_address < 0 || end_linear_address > linear->surface.surf_size)
- return false;
-
- /* Check requirements. */
- if (tiled_address % 256 == 0 && linear_address % 4 == 0 && linear_pitch % xalign == 0 &&
- linear_x % xalign == 0 && tiled_x % xalign == 0 && copy_width_aligned % xalign == 0 &&
- tiled_micro_mode != V_009910_ADDR_SURF_ROTATED_MICRO_TILING &&
- /* check if everything fits into the bitfields */
- tiled->surface.u.legacy.tile_split <= 4096 && pitch_tile_max < (1 << 11) &&
- slice_tile_max < (1 << 22) && linear_pitch <= (1 << 14) &&
- linear_slice_pitch <= (1 << 28) && copy_width_aligned <= (1 << 14) &&
- copy_height <= (1 << 14) && copy_depth <= (1 << 11)) {
- struct radeon_cmdbuf *cs = &sctx->sdma_cs;
- uint32_t direction = linear == sdst ? 1u << 31 : 0;
-
- si_need_dma_space(sctx, 14, &sdst->buffer, &ssrc->buffer);
-
- radeon_emit(cs, CIK_SDMA_PACKET(CIK_SDMA_OPCODE_COPY,
- CIK_SDMA_COPY_SUB_OPCODE_TILED_SUB_WINDOW, 0) |
- direction);
- radeon_emit(cs, tiled_address);
- radeon_emit(cs, tiled_address >> 32);
- radeon_emit(cs, tiled_x | (tiled_y << 16));
- radeon_emit(cs, tiled_z | (pitch_tile_max << 16));
- radeon_emit(cs, slice_tile_max);
- radeon_emit(cs, encode_tile_info(sctx, tiled, tiled_level, true));
- radeon_emit(cs, linear_address);
- radeon_emit(cs, linear_address >> 32);
- radeon_emit(cs, linear_x | (linear_y << 16));
- radeon_emit(cs, linear_z | ((linear_pitch - 1) << 16));
- radeon_emit(cs, linear_slice_pitch - 1);
- if (sctx->chip_class == GFX7) {
- radeon_emit(cs, copy_width_aligned | (copy_height << 16));
- radeon_emit(cs, copy_depth);
- } else {
- radeon_emit(cs, (copy_width_aligned - 1) | ((copy_height - 1) << 16));
- radeon_emit(cs, (copy_depth - 1));
- }
- return true;
- }
- }
-
- /* Tiled -> Tiled sub-window copy. */
- if (dst_mode >= RADEON_SURF_MODE_1D && src_mode >= RADEON_SURF_MODE_1D &&
- /* check if these fit into the bitfields */
- src_address % 256 == 0 && dst_address % 256 == 0 &&
- ssrc->surface.u.legacy.tile_split <= 4096 && sdst->surface.u.legacy.tile_split <= 4096 &&
- dstx % 8 == 0 && dsty % 8 == 0 && srcx % 8 == 0 && srcy % 8 == 0 &&
- /* this can either be equal, or display->rotated (GFX8+ only) */
- (src_micro_mode == dst_micro_mode ||
- (sctx->chip_class >= GFX8 && src_micro_mode == V_009910_ADDR_SURF_DISPLAY_MICRO_TILING &&
- dst_micro_mode == V_009910_ADDR_SURF_ROTATED_MICRO_TILING))) {
- assert(src_pitch % 8 == 0);
- assert(dst_pitch % 8 == 0);
- assert(src_slice_pitch % 64 == 0);
- assert(dst_slice_pitch % 64 == 0);
- unsigned src_pitch_tile_max = src_pitch / 8 - 1;
- unsigned dst_pitch_tile_max = dst_pitch / 8 - 1;
- unsigned src_slice_tile_max = src_slice_pitch / 64 - 1;
- unsigned dst_slice_tile_max = dst_slice_pitch / 64 - 1;
- unsigned copy_width_aligned = copy_width;
- unsigned copy_height_aligned = copy_height;
-
- /* If the region ends at the last pixel and is unaligned, we
- * can copy the remainder of the tile that is not visible to
- * make it aligned.
- */
- if (copy_width % 8 != 0 && srcx + copy_width == src_width && dstx + copy_width == dst_width)
- copy_width_aligned = align(copy_width, 8);
-
- if (copy_height % 8 != 0 && srcy + copy_height == src_height &&
- dsty + copy_height == dst_height)
- copy_height_aligned = align(copy_height, 8);
-
- /* check if these fit into the bitfields */
- if (src_pitch_tile_max < (1 << 11) && dst_pitch_tile_max < (1 << 11) &&
- src_slice_tile_max < (1 << 22) && dst_slice_tile_max < (1 << 22) &&
- copy_width_aligned <= (1 << 14) && copy_height_aligned <= (1 << 14) &&
- copy_depth <= (1 << 11) && copy_width_aligned % 8 == 0 && copy_height_aligned % 8 == 0 &&
- /* HW limitation - GFX7: */
- (sctx->chip_class != GFX7 ||
- (copy_width_aligned < (1 << 14) && copy_height_aligned < (1 << 14) &&
- copy_depth < (1 << 11))) &&
- /* HW limitation - some GFX7 parts: */
- ((sctx->family != CHIP_BONAIRE && sctx->family != CHIP_KAVERI &&
- sctx->family != CHIP_KABINI) ||
- (srcx + copy_width_aligned != (1 << 14) && srcy + copy_height_aligned != (1 << 14) &&
- dstx + copy_width != (1 << 14)))) {
- struct radeon_cmdbuf *cs = &sctx->sdma_cs;
-
- si_need_dma_space(sctx, 15, &sdst->buffer, &ssrc->buffer);
-
- radeon_emit(
- cs, CIK_SDMA_PACKET(CIK_SDMA_OPCODE_COPY, CIK_SDMA_COPY_SUB_OPCODE_T2T_SUB_WINDOW, 0));
- radeon_emit(cs, src_address);
- radeon_emit(cs, src_address >> 32);
- radeon_emit(cs, srcx | (srcy << 16));
- radeon_emit(cs, srcz | (src_pitch_tile_max << 16));
- radeon_emit(cs, src_slice_tile_max);
- radeon_emit(cs, encode_tile_info(sctx, ssrc, src_level, true));
- radeon_emit(cs, dst_address);
- radeon_emit(cs, dst_address >> 32);
- radeon_emit(cs, dstx | (dsty << 16));
- radeon_emit(cs, dstz | (dst_pitch_tile_max << 16));
- radeon_emit(cs, dst_slice_tile_max);
- radeon_emit(cs, encode_tile_info(sctx, sdst, dst_level, false));
- if (sctx->chip_class == GFX7) {
- radeon_emit(cs, copy_width_aligned | (copy_height_aligned << 16));
- radeon_emit(cs, copy_depth);
- } else {
- radeon_emit(cs, (copy_width_aligned - 8) | ((copy_height_aligned - 8) << 16));
- radeon_emit(cs, (copy_depth - 1));
- }
- return true;
- }
- }
-
- return false;
-}
-
-static void cik_sdma_copy(struct pipe_context *ctx, struct pipe_resource *dst, unsigned dst_level,
- unsigned dstx, unsigned dsty, unsigned dstz, struct pipe_resource *src,
- unsigned src_level, const struct pipe_box *src_box)
-{
- struct si_context *sctx = (struct si_context *)ctx;
-
- assert(src->target != PIPE_BUFFER);
-
- if (!sctx->sdma_cs.priv || src->flags & PIPE_RESOURCE_FLAG_SPARSE ||
- dst->flags & PIPE_RESOURCE_FLAG_SPARSE)
- goto fallback;
-
- /* SDMA causes corruption. See:
- * https://bugs.freedesktop.org/show_bug.cgi?id=110575
- * https://bugs.freedesktop.org/show_bug.cgi?id=110635
- *
- * Keep SDMA enabled on APUs.
- */
- if (sctx->screen->debug_flags & DBG(FORCE_SDMA) ||
- (!sctx->screen->info.has_dedicated_vram &&
- !(sctx->screen->debug_flags & DBG(NO_SDMA_COPY_IMAGE)))) {
- if ((sctx->chip_class == GFX7 || sctx->chip_class == GFX8) &&
- cik_sdma_copy_texture(sctx, dst, dst_level, dstx, dsty, dstz, src, src_level, src_box))
- return;
- else if (sctx->chip_class == GFX9 && si_sdma_v4_copy_texture(sctx, dst, dst_level, dstx, dsty,
- dstz, src, src_level, src_box))
- return;
- }
-
-fallback:
- si_resource_copy_region(ctx, dst, dst_level, dstx, dsty, dstz, src, src_level, src_box);
-}
-
-void cik_init_sdma_functions(struct si_context *sctx)
-{
- sctx->dma_copy = cik_sdma_copy;
-}
# SOFTWARE.
files_libradeonsi = files(
- 'cik_sdma.c',
'driinfo_radeonsi.h',
'gfx10_query.c',
'gfx10_shader_ngg.c',
'si_cp_reg_shadowing.c',
'si_debug.c',
'si_descriptors.c',
- 'si_dma_cs.c',
'si_fence.c',
'si_get.c',
'si_gfx_cs.c',
/* SNORM8 blitting has precision issues on some chips. Use the SINT
* equivalent instead, which doesn't force DCC decompression.
- * Note that some chips avoid this issue by using SDMA.
*/
if (util_format_is_snorm8(dst_templ.format)) {
dst_templ.format = src_templ.format = util_format_snorm8_to_sint8(dst_templ.format);
static void si_blit(struct pipe_context *ctx, const struct pipe_blit_info *info)
{
struct si_context *sctx = (struct si_context *)ctx;
- struct si_texture *dst = (struct si_texture *)info->dst.resource;
if (do_hardware_msaa_resolve(ctx, info)) {
return;
}
- /* Using SDMA for copying to a linear texture in GTT is much faster.
- * This improves DRI PRIME performance.
- *
- * resource_copy_region can't do this yet, because dma_copy calls it
- * on failure (recursion).
+ /* Using compute for copying to a linear texture in GTT is much faster than
+ * going through RBs (render backends). This improves DRI PRIME performance.
*/
- if (dst->surface.is_linear && util_can_blit_via_copy_region(info, false)) {
- sctx->dma_copy(ctx, info->dst.resource, info->dst.level, info->dst.box.x, info->dst.box.y,
- info->dst.box.z, info->src.resource, info->src.level, &info->src.box);
+ if (util_can_blit_via_copy_region(info, false)) {
+ si_resource_copy_region(ctx, info->dst.resource, info->dst.level,
+ info->dst.box.x, info->dst.box.y, info->dst.box.z,
+ info->src.resource, info->src.level, &info->src.box);
return;
}
si_decompress_subresource(ctx, info->src.resource, PIPE_MASK_RGBAZS, info->src.level,
info->src.box.z, info->src.box.z + info->src.box.depth - 1);
- if (sctx->screen->debug_flags & DBG(FORCE_SDMA) && util_try_blit_via_copy_region(ctx, info))
- return;
-
si_blitter_begin(sctx, SI_BLIT | (info->render_condition_enable ? 0 : SI_DISABLE_RENDER_COND));
util_blitter_blit(sctx->blitter, info);
si_blitter_end(sctx);
bool si_cs_is_buffer_referenced(struct si_context *sctx, struct pb_buffer *buf,
enum radeon_bo_usage usage)
{
- if (sctx->ws->cs_is_buffer_referenced(&sctx->gfx_cs, buf, usage)) {
- return true;
- }
- if (radeon_emitted(&sctx->sdma_cs, 0) &&
- sctx->ws->cs_is_buffer_referenced(&sctx->sdma_cs, buf, usage)) {
- return true;
- }
- return false;
+ return sctx->ws->cs_is_buffer_referenced(&sctx->gfx_cs, buf, usage);
}
void *si_buffer_map(struct si_context *sctx, struct si_resource *resource,
unsigned usage)
{
- enum radeon_bo_usage rusage = RADEON_USAGE_READWRITE;
- bool busy = false;
-
- assert(!(resource->flags & RADEON_FLAG_SPARSE));
-
- if (usage & PIPE_MAP_UNSYNCHRONIZED) {
- return sctx->ws->buffer_map(resource->buf, NULL, usage);
- }
-
- if (!(usage & PIPE_MAP_WRITE)) {
- /* have to wait for the last write */
- rusage = RADEON_USAGE_WRITE;
- }
-
- if (radeon_emitted(&sctx->gfx_cs, sctx->initial_gfx_cs_size) &&
- sctx->ws->cs_is_buffer_referenced(&sctx->gfx_cs, resource->buf, rusage)) {
- if (usage & PIPE_MAP_DONTBLOCK) {
- si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
- return NULL;
- } else {
- si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
- busy = true;
- }
- }
- if (radeon_emitted(&sctx->sdma_cs, 0) &&
- sctx->ws->cs_is_buffer_referenced(&sctx->sdma_cs, resource->buf, rusage)) {
- if (usage & PIPE_MAP_DONTBLOCK) {
- si_flush_dma_cs(sctx, PIPE_FLUSH_ASYNC, NULL);
- return NULL;
- } else {
- si_flush_dma_cs(sctx, 0, NULL);
- busy = true;
- }
- }
-
- if (busy || !sctx->ws->buffer_wait(resource->buf, 0, rusage)) {
- if (usage & PIPE_MAP_DONTBLOCK) {
- return NULL;
- } else {
- /* We will be wait for the GPU. Wait for any offloaded
- * CS flush to complete to avoid busy-waiting in the winsys. */
- sctx->ws->cs_sync_flush(&sctx->gfx_cs);
- if (sctx->sdma_cs.priv)
- sctx->ws->cs_sync_flush(&sctx->sdma_cs);
- }
- }
-
- /* Setting the CS to NULL will prevent doing checks we have done already. */
- return sctx->ws->buffer_map(resource->buf, NULL, usage);
+ return sctx->ws->buffer_map(resource->buf, &sctx->gfx_cs, usage);
}
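With the SDMA IB gone, si_buffer_map no longer has to check two command streams and flush them by hand; passing &sctx->gfx_cs to the winsys buffer_map lets the winsys flush and wait only when the buffer is actually referenced. A hypothetical caller sketch (the helper name is assumed, not part of this patch):

#include <stdbool.h>
#include "si_pipe.h"

/* Sketch: map a buffer for writing; unless PIPE_MAP_UNSYNCHRONIZED is set,
 * the winsys synchronizes against any pending gfx work referencing it. */
static void *example_map_for_write(struct si_context *sctx,
                                   struct si_resource *buf,
                                   bool unsynchronized)
{
   unsigned usage = PIPE_MAP_WRITE |
                    (unsynchronized ? PIPE_MAP_UNSYNCHRONIZED : 0);
   return si_buffer_map(sctx, buf, usage);
}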
void si_init_resource_fields(struct si_screen *sscreen, struct si_resource *res, uint64_t size,
res->flags |= RADEON_FLAG_DRIVER_INTERNAL;
/* For higher throughput and lower latency over PCIe assuming sequential access.
- * Only CP DMA, SDMA, and optimized compute benefit from this.
+ * Only CP DMA and optimized compute benefit from this.
* GFX8 and older don't support RADEON_FLAG_UNCACHED.
*/
if (sscreen->info.chip_class >= GFX9 &&
}
}
- if (usage & PIPE_MAP_FLUSH_EXPLICIT &&
- buf->b.b.flags & SI_RESOURCE_FLAG_UPLOAD_FLUSH_EXPLICIT_VIA_SDMA) {
- usage &= ~(PIPE_MAP_UNSYNCHRONIZED | PIPE_MAP_PERSISTENT);
- usage |= PIPE_MAP_DISCARD_RANGE;
- force_discard_range = true;
- }
-
if (usage & PIPE_MAP_DISCARD_RANGE &&
((!(usage & (PIPE_MAP_UNSYNCHRONIZED | PIPE_MAP_PERSISTENT))) ||
(buf->flags & RADEON_FLAG_SPARSE))) {
box->width + (box->x % SI_MAP_BUFFER_ALIGNMENT), 256);
if (staging) {
/* Copy the VRAM buffer to the staging buffer. */
- si_sdma_copy_buffer(sctx, &staging->b.b, resource, box->x % SI_MAP_BUFFER_ALIGNMENT,
- box->x, box->width);
+ si_copy_buffer(sctx, &staging->b.b, resource, box->x % SI_MAP_BUFFER_ALIGNMENT,
+ box->x, box->width);
data = si_buffer_map(sctx, staging, usage & ~PIPE_MAP_UNSYNCHRONIZED);
if (!data) {
unsigned src_offset =
stransfer->offset + transfer->box.x % SI_MAP_BUFFER_ALIGNMENT + (box->x - transfer->box.x);
- if (buf->b.b.flags & SI_RESOURCE_FLAG_UPLOAD_FLUSH_EXPLICIT_VIA_SDMA) {
- /* This should be true for all uploaders. */
- assert(transfer->box.x == 0);
-
- /* Find a previous upload and extend its range. The last
- * upload is likely to be at the end of the list.
- */
- for (int i = sctx->num_sdma_uploads - 1; i >= 0; i--) {
- struct si_sdma_upload *up = &sctx->sdma_uploads[i];
-
- if (up->dst != buf)
- continue;
-
- assert(up->src == stransfer->staging);
- assert(box->x > up->dst_offset);
- up->size = box->x + box->width - up->dst_offset;
- return;
- }
-
- /* Enlarge the array if it's full. */
- if (sctx->num_sdma_uploads == sctx->max_sdma_uploads) {
- unsigned size;
-
- sctx->max_sdma_uploads += 4;
- size = sctx->max_sdma_uploads * sizeof(sctx->sdma_uploads[0]);
- sctx->sdma_uploads = realloc(sctx->sdma_uploads, size);
- }
-
- /* Add a new upload. */
- struct si_sdma_upload *up = &sctx->sdma_uploads[sctx->num_sdma_uploads++];
- up->dst = up->src = NULL;
- si_resource_reference(&up->dst, buf);
- si_resource_reference(&up->src, stransfer->staging);
- up->dst_offset = box->x;
- up->src_offset = src_offset;
- up->size = box->width;
- return;
- }
-
/* Copy the staging buffer into the original one. */
si_copy_buffer(sctx, transfer->resource, &stransfer->staging->b.b, box->x, src_offset,
box->width);
ctx->ws->cs_is_buffer_referenced(&ctx->gfx_cs, res->buf, RADEON_USAGE_READWRITE)) {
si_flush_gfx_cs(ctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
}
- if (radeon_emitted(&ctx->sdma_cs, 0) &&
- ctx->ws->cs_is_buffer_referenced(&ctx->sdma_cs, res->buf, RADEON_USAGE_READWRITE)) {
- si_flush_dma_cs(ctx, PIPE_FLUSH_ASYNC, NULL);
- }
-
- if (ctx->sdma_cs.priv)
- ctx->ws->cs_sync_flush(&ctx->sdma_cs);
ctx->ws->cs_sync_flush(&ctx->gfx_cs);
assert(resource->target == PIPE_BUFFER);
/* SNORM8 blitting has precision issues on some chips. Use the SINT
* equivalent instead, which doesn't force DCC decompression.
- * Note that some chips avoid this issue by using SDMA.
*/
if (util_format_is_snorm8(dst->format)) {
image[0].format = image[1].format = util_format_snorm8_to_sint8(dst->format);
si_dump_compute_descriptors(sctx, log);
}
-static void si_dump_dma(struct si_context *sctx, struct radeon_saved_cs *saved, FILE *f)
-{
- static const char ib_name[] = "sDMA IB";
- unsigned i;
-
- si_dump_bo_list(sctx, saved, f);
-
- fprintf(f, "------------------ %s begin ------------------\n", ib_name);
-
- for (i = 0; i < saved->num_dw; ++i) {
- fprintf(f, " %08x\n", saved->ib[i]);
- }
-
- fprintf(f, "------------------- %s end -------------------\n", ib_name);
- fprintf(f, "\n");
-
- fprintf(f, "SDMA Dump Done.\n");
-}
-
void si_check_vm_faults(struct si_context *sctx, struct radeon_saved_cs *saved, enum ring_type ring)
{
struct pipe_screen *screen = sctx->b.screen;
u_log_context_destroy(&log);
break;
}
- case RING_DMA:
- si_dump_dma(sctx, saved, f);
- break;
default:
break;
+++ /dev/null
-/*
- * Copyright 2018 Advanced Micro Devices, Inc.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * on the rights to use, copy, modify, merge, publish, distribute, sub
- * license, and/or sell copies of the Software, and to permit persons to whom
- * the Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
- * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
- * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
- * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
- * USE OR OTHER DEALINGS IN THE SOFTWARE.
- */
-
-#include "si_pipe.h"
-#include "sid.h"
-
-static void si_dma_emit_wait_idle(struct si_context *sctx)
-{
- struct radeon_cmdbuf *cs = &sctx->sdma_cs;
-
- /* NOP waits for idle. */
- if (sctx->chip_class >= GFX7)
- radeon_emit(cs, 0x00000000); /* NOP */
- else
- radeon_emit(cs, 0xf0000000); /* NOP */
-}
-
-void si_dma_emit_timestamp(struct si_context *sctx, struct si_resource *dst, uint64_t offset)
-{
- struct radeon_cmdbuf *cs = &sctx->sdma_cs;
- uint64_t va = dst->gpu_address + offset;
-
- if (sctx->chip_class == GFX6) {
- unreachable("SI DMA doesn't support the timestamp packet.");
- return;
- }
-
- /* Mark the buffer range of destination as valid (initialized),
- * so that transfer_map knows it should wait for the GPU when mapping
- * that range. */
- util_range_add(&dst->b.b, &dst->valid_buffer_range, offset, offset + 8);
-
- assert(va % 8 == 0);
-
- si_need_dma_space(sctx, 4, dst, NULL);
- si_dma_emit_wait_idle(sctx);
-
- radeon_emit(
- cs, CIK_SDMA_PACKET(CIK_SDMA_OPCODE_TIMESTAMP, SDMA_TS_SUB_OPCODE_GET_GLOBAL_TIMESTAMP, 0));
- radeon_emit(cs, va);
- radeon_emit(cs, va >> 32);
-}
-
-void si_sdma_clear_buffer(struct si_context *sctx, struct pipe_resource *dst, uint64_t offset,
- uint64_t size, unsigned clear_value)
-{
- struct radeon_cmdbuf *cs = &sctx->sdma_cs;
- unsigned i, ncopy, csize;
- struct si_resource *sdst = si_resource(dst);
-
- assert(offset % 4 == 0);
- assert(size);
- assert(size % 4 == 0);
-
- if (!cs->priv || dst->flags & PIPE_RESOURCE_FLAG_SPARSE ||
- sctx->screen->debug_flags & DBG(NO_SDMA_CLEARS) ||
- unlikely(radeon_uses_secure_bos(sctx->ws))) {
- sctx->b.clear_buffer(&sctx->b, dst, offset, size, &clear_value, 4);
- return;
- }
-
- /* Mark the buffer range of destination as valid (initialized),
- * so that transfer_map knows it should wait for the GPU when mapping
- * that range. */
- util_range_add(dst, &sdst->valid_buffer_range, offset, offset + size);
-
- offset += sdst->gpu_address;
-
- if (sctx->chip_class == GFX6) {
- /* the same maximum size as for copying */
- ncopy = DIV_ROUND_UP(size, SI_DMA_COPY_MAX_DWORD_ALIGNED_SIZE);
- si_need_dma_space(sctx, ncopy * 4, sdst, NULL);
-
- for (i = 0; i < ncopy; i++) {
- csize = MIN2(size, SI_DMA_COPY_MAX_DWORD_ALIGNED_SIZE);
- radeon_emit(cs, SI_DMA_PACKET(SI_DMA_PACKET_CONSTANT_FILL, 0, csize / 4));
- radeon_emit(cs, offset);
- radeon_emit(cs, clear_value);
- radeon_emit(cs, (offset >> 32) << 16);
- offset += csize;
- size -= csize;
- }
- return;
- }
-
- /* The following code is for CI and later. */
- /* the same maximum size as for copying */
- unsigned max_size_per_packet = sctx->chip_class >= GFX10_3 ?
- GFX103_SDMA_COPY_MAX_SIZE :
- CIK_SDMA_COPY_MAX_SIZE;
- ncopy = DIV_ROUND_UP(size, max_size_per_packet);
- si_need_dma_space(sctx, ncopy * 5, sdst, NULL);
-
- for (i = 0; i < ncopy; i++) {
- csize = MIN2(size, max_size_per_packet);
- radeon_emit(cs, CIK_SDMA_PACKET(CIK_SDMA_PACKET_CONSTANT_FILL, 0, 0x8000 /* dword copy */));
- radeon_emit(cs, offset);
- radeon_emit(cs, offset >> 32);
- radeon_emit(cs, clear_value);
- /* dw count */
- radeon_emit(cs, (sctx->chip_class >= GFX9 ? csize - 1 : csize) & 0xfffffffc);
- offset += csize;
- size -= csize;
- }
-}
-
-void si_sdma_copy_buffer(struct si_context *sctx, struct pipe_resource *dst,
- struct pipe_resource *src, uint64_t dst_offset, uint64_t src_offset,
- uint64_t size)
-{
- struct radeon_cmdbuf *cs = &sctx->sdma_cs;
- unsigned i, ncopy, csize;
- struct si_resource *sdst = si_resource(dst);
- struct si_resource *ssrc = si_resource(src);
-
- if (!cs->priv || dst->flags & PIPE_RESOURCE_FLAG_SPARSE || src->flags & PIPE_RESOURCE_FLAG_SPARSE ||
- (ssrc->flags & RADEON_FLAG_ENCRYPTED) != (sdst->flags & RADEON_FLAG_ENCRYPTED)) {
- si_copy_buffer(sctx, dst, src, dst_offset, src_offset, size);
- return;
- }
-
- /* Mark the buffer range of destination as valid (initialized),
- * so that transfer_map knows it should wait for the GPU when mapping
- * that range. */
- util_range_add(dst, &sdst->valid_buffer_range, dst_offset, dst_offset + size);
-
- dst_offset += sdst->gpu_address;
- src_offset += ssrc->gpu_address;
-
- if (sctx->chip_class == GFX6) {
- unsigned max_size, sub_cmd, shift;
-
- /* see whether we should use the dword-aligned or byte-aligned copy */
- if (!(dst_offset % 4) && !(src_offset % 4) && !(size % 4)) {
- sub_cmd = SI_DMA_COPY_DWORD_ALIGNED;
- shift = 2;
- max_size = SI_DMA_COPY_MAX_DWORD_ALIGNED_SIZE;
- } else {
- sub_cmd = SI_DMA_COPY_BYTE_ALIGNED;
- shift = 0;
- max_size = SI_DMA_COPY_MAX_BYTE_ALIGNED_SIZE;
- }
-
- ncopy = DIV_ROUND_UP(size, max_size);
- si_need_dma_space(sctx, ncopy * 5, sdst, ssrc);
-
- for (i = 0; i < ncopy; i++) {
- csize = MIN2(size, max_size);
- radeon_emit(cs, SI_DMA_PACKET(SI_DMA_PACKET_COPY, sub_cmd, csize >> shift));
- radeon_emit(cs, dst_offset);
- radeon_emit(cs, src_offset);
- radeon_emit(cs, (dst_offset >> 32UL) & 0xff);
- radeon_emit(cs, (src_offset >> 32UL) & 0xff);
- dst_offset += csize;
- src_offset += csize;
- size -= csize;
- }
- return;
- }
-
- /* The following code is for CI and later. */
- unsigned max_size_per_packet = sctx->chip_class >= GFX10_3 ?
- GFX103_SDMA_COPY_MAX_SIZE :
- CIK_SDMA_COPY_MAX_SIZE;
- unsigned align = ~0u;
- ncopy = DIV_ROUND_UP(size, max_size_per_packet);
-
- /* Align copy size to dw if src/dst address are dw aligned */
- if ((src_offset & 0x3) == 0 && (dst_offset & 0x3) == 0 && size > 4 && (size & 3) != 0) {
- align = ~0x3u;
- ncopy++;
- }
-
- si_need_dma_space(sctx, ncopy * 7, sdst, ssrc);
-
- for (i = 0; i < ncopy; i++) {
- csize = size >= 4 ? MIN2(size & align, max_size_per_packet) : size;
- radeon_emit(cs, CIK_SDMA_PACKET(CIK_SDMA_OPCODE_COPY, CIK_SDMA_COPY_SUB_OPCODE_LINEAR,
- (sctx->ws->cs_is_secure(cs) ? 1u : 0) << 2));
- radeon_emit(cs, sctx->chip_class >= GFX9 ? csize - 1 : csize);
- radeon_emit(cs, 0); /* src/dst endian swap */
- radeon_emit(cs, src_offset);
- radeon_emit(cs, src_offset >> 32);
- radeon_emit(cs, dst_offset);
- radeon_emit(cs, dst_offset >> 32);
- dst_offset += csize;
- src_offset += csize;
- size -= csize;
- }
-}
-
-void si_need_dma_space(struct si_context *ctx, unsigned num_dw, struct si_resource *dst,
- struct si_resource *src)
-{
- struct radeon_winsys *ws = ctx->ws;
- uint64_t vram = ctx->sdma_cs.used_vram;
- uint64_t gtt = ctx->sdma_cs.used_gart;
-
- if (dst) {
- vram += dst->vram_usage;
- gtt += dst->gart_usage;
- }
- if (src) {
- vram += src->vram_usage;
- gtt += src->gart_usage;
- }
-
- /* Flush the GFX IB if DMA depends on it. */
- if (!ctx->sdma_uploads_in_progress && radeon_emitted(&ctx->gfx_cs, ctx->initial_gfx_cs_size) &&
- ((dst && ws->cs_is_buffer_referenced(&ctx->gfx_cs, dst->buf, RADEON_USAGE_READWRITE)) ||
- (src && ws->cs_is_buffer_referenced(&ctx->gfx_cs, src->buf, RADEON_USAGE_WRITE))))
- si_flush_gfx_cs(ctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
-
- bool use_secure_cmd = false;
- if (unlikely(radeon_uses_secure_bos(ctx->ws))) {
- if (src && src->flags & RADEON_FLAG_ENCRYPTED) {
- assert(!dst || (dst->flags & RADEON_FLAG_ENCRYPTED));
- use_secure_cmd = true;
- } else if (dst && (dst->flags & RADEON_FLAG_ENCRYPTED)) {
- use_secure_cmd = true;
- }
- }
-
- /* Flush if there's not enough space, or if the memory usage per IB
- * is too large.
- *
- * IBs using too little memory are limited by the IB submission overhead.
- * IBs using too much memory are limited by the kernel/TTM overhead.
- * Too long IBs create CPU-GPU pipeline bubbles and add latency.
- *
- * This heuristic makes sure that DMA requests are executed
- * very soon after the call is made and lowers memory usage.
- * It improves texture upload performance by keeping the DMA
- * engine busy while uploads are being submitted.
- */
- num_dw++; /* for emit_wait_idle below */
- if (!ctx->sdma_uploads_in_progress &&
- (use_secure_cmd != ctx->ws->cs_is_secure(&ctx->sdma_cs) ||
- !ws->cs_check_space(&ctx->sdma_cs, num_dw, false) ||
- ctx->sdma_cs.used_vram + ctx->sdma_cs.used_gart > 64 * 1024 * 1024 ||
- !radeon_cs_memory_below_limit(ctx->screen, &ctx->sdma_cs, vram, gtt))) {
- si_flush_dma_cs(ctx, PIPE_FLUSH_ASYNC | RADEON_FLUSH_TOGGLE_SECURE_SUBMISSION, NULL);
- assert(ctx->ws->cs_is_secure(&ctx->sdma_cs) == use_secure_cmd);
- assert((num_dw + ctx->sdma_cs.current.cdw) <= ctx->sdma_cs.current.max_dw);
- }
-
- /* Wait for idle if either buffer has been used in the IB before to
- * prevent read-after-write hazards.
- */
- if ((dst && ws->cs_is_buffer_referenced(&ctx->sdma_cs, dst->buf, RADEON_USAGE_READWRITE)) ||
- (src && ws->cs_is_buffer_referenced(&ctx->sdma_cs, src->buf, RADEON_USAGE_WRITE)))
- si_dma_emit_wait_idle(ctx);
-
- unsigned sync = ctx->sdma_uploads_in_progress ? 0 : RADEON_USAGE_SYNCHRONIZED;
- if (dst) {
- ws->cs_add_buffer(&ctx->sdma_cs, dst->buf, RADEON_USAGE_WRITE | sync, dst->domains, 0);
- }
- if (src) {
- ws->cs_add_buffer(&ctx->sdma_cs, src->buf, RADEON_USAGE_READ | sync, src->domains, 0);
- }
-
- /* this function is called before all DMA calls, so increment this. */
- ctx->num_dma_calls++;
-}
-
-void si_flush_dma_cs(struct si_context *ctx, unsigned flags, struct pipe_fence_handle **fence)
-{
- struct radeon_cmdbuf *cs = &ctx->sdma_cs;
- struct radeon_saved_cs saved;
- bool check_vm = (ctx->screen->debug_flags & DBG(CHECK_VM)) != 0;
-
- if (!radeon_emitted(cs, 0) &&
- !(flags & RADEON_FLUSH_TOGGLE_SECURE_SUBMISSION)) {
- if (fence)
- ctx->ws->fence_reference(fence, ctx->last_sdma_fence);
- return;
- }
-
- if (check_vm)
- si_save_cs(ctx->ws, cs, &saved, true);
-
- if (ctx->is_noop)
- flags |= RADEON_FLUSH_NOOP;
-
- ctx->ws->cs_flush(cs, flags, &ctx->last_sdma_fence);
- if (fence)
- ctx->ws->fence_reference(fence, ctx->last_sdma_fence);
-
- if (check_vm) {
- /* Use conservative timeout 800ms, after which we won't wait any
- * longer and assume the GPU is hung.
- */
- ctx->ws->fence_wait(ctx->ws, ctx->last_sdma_fence, 800 * 1000 * 1000);
-
- si_check_vm_faults(ctx, &saved, RING_DMA);
- si_clear_saved_cs(&saved);
- }
-}
unsigned offset;
};
-struct si_multi_fence {
+struct si_fence {
struct pipe_reference reference;
struct pipe_fence_handle *gfx;
- struct pipe_fence_handle *sdma;
struct tc_unflushed_batch_token *tc_token;
struct util_queue_fence ready;
{
struct radeon_winsys *ws = sctx->ws;
- if (sctx->sdma_cs.priv)
- ws->cs_add_fence_dependency(&sctx->sdma_cs, fence, 0);
ws->cs_add_fence_dependency(&sctx->gfx_cs, fence, 0);
}
struct pipe_fence_handle *src)
{
struct radeon_winsys *ws = ((struct si_screen *)screen)->ws;
- struct si_multi_fence **sdst = (struct si_multi_fence **)dst;
- struct si_multi_fence *ssrc = (struct si_multi_fence *)src;
+ struct si_fence **sdst = (struct si_fence **)dst;
+ struct si_fence *ssrc = (struct si_fence *)src;
if (pipe_reference(&(*sdst)->reference, &ssrc->reference)) {
ws->fence_reference(&(*sdst)->gfx, NULL);
- ws->fence_reference(&(*sdst)->sdma, NULL);
tc_unflushed_batch_token_reference(&(*sdst)->tc_token, NULL);
si_resource_reference(&(*sdst)->fine.buf, NULL);
FREE(*sdst);
*sdst = ssrc;
}
-static struct si_multi_fence *si_create_multi_fence()
+static struct si_fence *si_create_multi_fence()
{
- struct si_multi_fence *fence = CALLOC_STRUCT(si_multi_fence);
+ struct si_fence *fence = CALLOC_STRUCT(si_fence);
if (!fence)
return NULL;
struct pipe_fence_handle *si_create_fence(struct pipe_context *ctx,
struct tc_unflushed_batch_token *tc_token)
{
- struct si_multi_fence *fence = si_create_multi_fence();
+ struct si_fence *fence = si_create_multi_fence();
if (!fence)
return NULL;
struct pipe_fence_handle *fence, uint64_t timeout)
{
struct radeon_winsys *rws = ((struct si_screen *)screen)->ws;
- struct si_multi_fence *sfence = (struct si_multi_fence *)fence;
+ struct si_fence *sfence = (struct si_fence *)fence;
struct si_context *sctx;
int64_t abs_timeout = os_time_get_absolute_timeout(timeout);
}
}
- if (sfence->sdma) {
- if (!rws->fence_wait(rws, sfence->sdma, timeout))
- return false;
-
- /* Recompute the timeout after waiting. */
- if (timeout && timeout != PIPE_TIMEOUT_INFINITE) {
- int64_t time = os_time_get_nano();
- timeout = abs_timeout > time ? abs_timeout - time : 0;
- }
- }
-
if (!sfence->gfx)
return true;
{
struct si_screen *sscreen = (struct si_screen *)ctx->screen;
struct radeon_winsys *ws = sscreen->ws;
- struct si_multi_fence *sfence;
+ struct si_fence *sfence;
*pfence = NULL;
{
struct si_screen *sscreen = (struct si_screen *)screen;
struct radeon_winsys *ws = sscreen->ws;
- struct si_multi_fence *sfence = (struct si_multi_fence *)fence;
- int gfx_fd = -1, sdma_fd = -1;
+ struct si_fence *sfence = (struct si_fence *)fence;
+ int gfx_fd = -1;
if (!sscreen->info.has_fence_to_handle)
return -1;
if (sfence->gfx_unflushed.ctx)
return -1;
- if (sfence->sdma) {
- sdma_fd = ws->fence_export_sync_file(ws, sfence->sdma);
- if (sdma_fd == -1)
- return -1;
- }
if (sfence->gfx) {
gfx_fd = ws->fence_export_sync_file(ws, sfence->gfx);
if (gfx_fd == -1) {
- if (sdma_fd != -1)
- close(sdma_fd);
return -1;
}
}
/* If we don't have FDs at this point, it means we don't have fences
* either. */
- if (sdma_fd == -1 && gfx_fd == -1)
- return ws->export_signalled_sync_file(ws);
- if (sdma_fd == -1)
- return gfx_fd;
if (gfx_fd == -1)
- return sdma_fd;
+ return ws->export_signalled_sync_file(ws);
- /* Get a fence that will be a combination of both fences. */
- sync_accumulate("radeonsi", &gfx_fd, sdma_fd);
- close(sdma_fd);
return gfx_fd;
}
struct si_context *sctx = (struct si_context *)ctx;
struct radeon_winsys *ws = sctx->ws;
struct pipe_fence_handle *gfx_fence = NULL;
- struct pipe_fence_handle *sdma_fence = NULL;
bool deferred_fence = false;
struct si_fine_fence fine = {};
unsigned rflags = PIPE_FLUSH_ASYNC;
si_fine_fence_set(sctx, &fine, flags);
}
- /* DMA IBs are preambles to gfx IBs, therefore must be flushed first. */
- if (sctx->sdma_cs.priv)
- si_flush_dma_cs(sctx, rflags, fence ? &sdma_fence : NULL);
-
if (force_flush) {
sctx->initial_gfx_cs_size = 0;
}
/* Both engines can signal out of order, so we need to keep both fences. */
if (fence) {
- struct si_multi_fence *multi_fence;
+ struct si_fence *new_fence;
if (flags & TC_FLUSH_ASYNC) {
- multi_fence = (struct si_multi_fence *)*fence;
- assert(multi_fence);
+ new_fence = (struct si_fence *)*fence;
+ assert(new_fence);
} else {
- multi_fence = si_create_multi_fence();
- if (!multi_fence) {
- ws->fence_reference(&sdma_fence, NULL);
+ new_fence = si_create_multi_fence();
+ if (!new_fence) {
ws->fence_reference(&gfx_fence, NULL);
goto finish;
}
screen->fence_reference(screen, fence, NULL);
- *fence = (struct pipe_fence_handle *)multi_fence;
+ *fence = (struct pipe_fence_handle *)new_fence;
}
/* If both fences are NULL, fence_finish will always return true. */
- multi_fence->gfx = gfx_fence;
- multi_fence->sdma = sdma_fence;
+ new_fence->gfx = gfx_fence;
if (deferred_fence) {
- multi_fence->gfx_unflushed.ctx = sctx;
- multi_fence->gfx_unflushed.ib_index = sctx->num_gfx_cs_flushes;
+ new_fence->gfx_unflushed.ctx = sctx;
+ new_fence->gfx_unflushed.ib_index = sctx->num_gfx_cs_flushes;
}
- multi_fence->fine = fine;
+ new_fence->fine = fine;
fine.buf = NULL;
if (flags & TC_FLUSH_ASYNC) {
- util_queue_fence_signal(&multi_fence->ready);
- tc_unflushed_batch_token_reference(&multi_fence->tc_token, NULL);
+ util_queue_fence_signal(&new_fence->ready);
+ tc_unflushed_batch_token_reference(&new_fence->tc_token, NULL);
}
}
assert(!fine.buf);
finish:
if (!(flags & (PIPE_FLUSH_DEFERRED | PIPE_FLUSH_ASYNC))) {
- if (sctx->sdma_cs.priv)
- ws->cs_sync_flush(&sctx->sdma_cs);
ws->cs_sync_flush(&sctx->gfx_cs);
}
}
static void si_fence_server_signal(struct pipe_context *ctx, struct pipe_fence_handle *fence)
{
struct si_context *sctx = (struct si_context *)ctx;
- struct si_multi_fence *sfence = (struct si_multi_fence *)fence;
+ struct si_fence *sfence = (struct si_fence *)fence;
- /* We should have at least one syncobj to signal */
- assert(sfence->sdma || sfence->gfx);
+ assert(sfence->gfx);
- if (sfence->sdma)
- si_add_syncobj_signal(sctx, sfence->sdma);
if (sfence->gfx)
si_add_syncobj_signal(sctx, sfence->gfx);
static void si_fence_server_sync(struct pipe_context *ctx, struct pipe_fence_handle *fence)
{
struct si_context *sctx = (struct si_context *)ctx;
- struct si_multi_fence *sfence = (struct si_multi_fence *)fence;
+ struct si_fence *sfence = (struct si_fence *)fence;
util_queue_fence_wait(&sfence->ready);
* the time it takes to create and submit that IB, flushing decreases
* performance. Therefore, DO NOT FLUSH.
*/
- if (sfence->sdma)
- si_add_fence_dependency(sctx, sfence->sdma);
- if (sfence->gfx)
- si_add_fence_dependency(sctx, sfence->gfx);
+ si_add_fence_dependency(sctx, sfence->gfx);
}
void si_init_fence_functions(struct si_context *ctx)
{
struct radeon_cmdbuf *cs = &ctx->gfx_cs;
- /* There is no need to flush the DMA IB here, because
- * si_need_dma_space always flushes the GFX IB if there is
- * a conflict, which means any unflushed DMA commands automatically
- * precede the GFX IB (= they had no dependency on the GFX IB when
- * they were submitted).
- */
-
/* There are two memory usage counters in the winsys for all buffers
* that have been added (cs_add_buffer) and two counters in the pipe
* driver for those that haven't been added yet.
si_flush_gfx_cs(ctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
}
-void si_unref_sdma_uploads(struct si_context *sctx)
-{
- for (unsigned i = 0; i < sctx->num_sdma_uploads; i++) {
- si_resource_reference(&sctx->sdma_uploads[i].dst, NULL);
- si_resource_reference(&sctx->sdma_uploads[i].src, NULL);
- }
- sctx->num_sdma_uploads = 0;
-}
-
void si_flush_gfx_cs(struct si_context *ctx, unsigned flags, struct pipe_fence_handle **fence)
{
struct radeon_cmdbuf *cs = &ctx->gfx_cs;
ctx->gfx_flush_in_progress = true;
- /* If the gallium frontend is flushing the GFX IB, si_flush_from_st is
- * responsible for flushing the DMA IB and merging the fences from both.
- * If the driver flushes the GFX IB internally, and it should never ask
- * for a fence handle.
- */
- assert(!radeon_emitted(&ctx->sdma_cs, 0) || fence == NULL);
-
- /* Update the sdma_uploads list by flushing the uploader. */
- u_upload_unmap(ctx->b.const_uploader);
-
- /* Execute SDMA uploads. */
- ctx->sdma_uploads_in_progress = true;
- for (unsigned i = 0; i < ctx->num_sdma_uploads; i++) {
- struct si_sdma_upload *up = &ctx->sdma_uploads[i];
-
- assert(up->src_offset % 4 == 0 && up->dst_offset % 4 == 0 && up->size % 4 == 0);
-
- si_sdma_copy_buffer(ctx, &up->dst->b.b, &up->src->b.b, up->dst_offset, up->src_offset,
- up->size);
- }
- ctx->sdma_uploads_in_progress = false;
- si_unref_sdma_uploads(ctx);
-
- /* Flush SDMA (preamble IB). */
- if (radeon_emitted(&ctx->sdma_cs, 0))
- si_flush_dma_cs(ctx, flags, NULL);
-
if (radeon_emitted(&ctx->prim_discard_compute_cs, 0)) {
struct radeon_cmdbuf *compute_cs = &ctx->prim_discard_compute_cs;
si_compute_signal_gfx(ctx);
{"cache_stats", DBG(CACHE_STATS), "Print shader cache statistics."},
/* Driver options: */
- {"forcedma", DBG(FORCE_SDMA), "Use SDMA for all operations when possible."},
- {"nodma", DBG(NO_SDMA), "Disable SDMA"},
- {"nodmaclear", DBG(NO_SDMA_CLEARS), "Disable SDMA clears"},
- {"nodmacopyimage", DBG(NO_SDMA_COPY_IMAGE), "Disable SDMA image copies"},
{"nowc", DBG(NO_WC), "Disable GTT write combining"},
{"check_vm", DBG(CHECK_VM), "Check VM faults and dump debug info."},
{"reserve_vmid", DBG(RESERVE_VMID), "Force VMID reservation per context."},
static const struct debug_named_value test_options[] = {
/* Tests: */
- {"testdma", DBG(TEST_DMA), "Invoke SDMA tests and exit."},
+ {"testdma", DBG(TEST_DMA), "Invoke blit tests and exit."},
{"testvmfaultcp", DBG(TEST_VMFAULT_CP), "Invoke a CP VM fault test and exit."},
- {"testvmfaultsdma", DBG(TEST_VMFAULT_SDMA), "Invoke a SDMA VM fault test and exit."},
{"testvmfaultshader", DBG(TEST_VMFAULT_SHADER), "Invoke a shader VM fault test and exit."},
{"testdmaperf", DBG(TEST_DMA_PERF), "Test DMA performance"},
{"testgds", DBG(TEST_GDS), "Test GDS."},
sctx->b.delete_compute_state(&sctx->b, sctx->sh_query_result_shader);
sctx->ws->cs_destroy(&sctx->gfx_cs);
- sctx->ws->cs_destroy(&sctx->sdma_cs);
if (sctx->ctx)
sctx->ws->ctx_destroy(sctx->ctx);
u_suballocator_destroy(&sctx->allocator_zeroed_memory);
sctx->ws->fence_reference(&sctx->last_gfx_fence, NULL);
- sctx->ws->fence_reference(&sctx->last_sdma_fence, NULL);
sctx->ws->fence_reference(&sctx->last_ib_barrier_fence, NULL);
si_resource_reference(&sctx->eop_bug_scratch, NULL);
si_resource_reference(&sctx->eop_bug_scratch_tmz, NULL);
util_dynarray_fini(&sctx->resident_tex_needs_color_decompress);
util_dynarray_fini(&sctx->resident_img_needs_color_decompress);
util_dynarray_fini(&sctx->resident_tex_needs_depth_decompress);
- si_unref_sdma_uploads(sctx);
- free(sctx->sdma_uploads);
FREE(sctx);
}
if (!sctx->ctx)
goto fail;
- /* SDMA causes corruption on:
- * - RX 580: https://gitlab.freedesktop.org/mesa/mesa/-/issues/1399, 1889
- * - gfx9 APUs: https://gitlab.freedesktop.org/mesa/mesa/-/issues/2814
- * - gfx10: https://gitlab.freedesktop.org/mesa/mesa/-/issues/1907,
- *   https://gitlab.freedesktop.org/drm/amd/issues/892
- *
- * While we could keep buffer copies and clears enabled, let's disable
- * everything, because SDMA also decreases CPU performance due to its
- * command submission overhead.
- *
- * SDMA is disabled on all chips (instead of just the ones listed above),
- * because it doesn't make sense to keep it enabled only on old chips,
- * which are not tested as often as newer chips.
- */
- if (sscreen->info.num_rings[RING_DMA] && !(sscreen->debug_flags & DBG(NO_SDMA)) &&
- sscreen->debug_flags & DBG(FORCE_SDMA)) {
- sctx->ws->cs_create(&sctx->sdma_cs, sctx->ctx, RING_DMA, (void *)si_flush_dma_cs,
- sctx, stop_exec_on_failure);
- }
-
- bool use_sdma_upload = sscreen->info.has_dedicated_vram && sctx->sdma_cs.priv;
sctx->b.const_uploader =
u_upload_create(&sctx->b, 256 * 1024, 0, PIPE_USAGE_DEFAULT,
- SI_RESOURCE_FLAG_32BIT |
- (use_sdma_upload ? SI_RESOURCE_FLAG_UPLOAD_FLUSH_EXPLICIT_VIA_SDMA : 0));
+ SI_RESOURCE_FLAG_32BIT);
if (!sctx->b.const_uploader)
goto fail;
- if (use_sdma_upload)
- u_upload_enable_flush_explicit(sctx->b.const_uploader);
-
ws->cs_create(&sctx->gfx_cs, sctx->ctx, sctx->has_graphics ? RING_GFX : RING_COMPUTE,
(void *)si_flush_gfx_cs, sctx, stop_exec_on_failure);
sctx->prim_discard_vertex_count_threshold = UINT_MAX;
}
- /* Initialize SDMA functions. */
- if (sctx->chip_class >= GFX7)
- cik_init_sdma_functions(sctx);
- else
- sctx->dma_copy = si_resource_copy_region;
-
- if (sscreen->debug_flags & DBG(FORCE_SDMA))
- sctx->b.resource_copy_region = sctx->dma_copy;
-
sctx->sample_mask = 0xffff;
/* Initialize multimedia functions. */
ctx->flush(ctx, NULL, 0);
puts("VM fault test: CP - done.");
}
- if (test_flags & DBG(TEST_VMFAULT_SDMA)) {
- si_sdma_clear_buffer(sctx, buf, 0, 4, 0);
- ctx->flush(ctx, NULL, 0);
- puts("VM fault test: SDMA - done.");
- }
if (test_flags & DBG(TEST_VMFAULT_SHADER)) {
util_test_constant_buffer(ctx, buf);
puts("VM fault test: Shader - done.");
si_test_dma_perf(sscreen);
}
- if (test_flags & (DBG(TEST_VMFAULT_CP) | DBG(TEST_VMFAULT_SDMA) | DBG(TEST_VMFAULT_SHADER)))
+ if (test_flags & (DBG(TEST_VMFAULT_CP) | DBG(TEST_VMFAULT_SHADER)))
si_test_vmfault(sscreen, test_flags);
if (test_flags & DBG(TEST_GDS))
#define SI_RESOURCE_FLAG_READ_ONLY (PIPE_RESOURCE_FLAG_DRV_PRIV << 5)
#define SI_RESOURCE_FLAG_32BIT (PIPE_RESOURCE_FLAG_DRV_PRIV << 6)
#define SI_RESOURCE_FLAG_CLEAR (PIPE_RESOURCE_FLAG_DRV_PRIV << 7)
-/* For const_uploader, upload data via GTT and copy to VRAM on context flush via SDMA. */
-#define SI_RESOURCE_FLAG_UPLOAD_FLUSH_EXPLICIT_VIA_SDMA (PIPE_RESOURCE_FLAG_DRV_PRIV << 8)
+/* gap */
/* Set a micro tile mode: */
#define SI_RESOURCE_FLAG_FORCE_MICRO_TILE_MODE (PIPE_RESOURCE_FLAG_DRV_PRIV << 9)
#define SI_RESOURCE_FLAG_MICRO_TILE_MODE_SHIFT (util_logbase2(PIPE_RESOURCE_FLAG_DRV_PRIV) + 10)
DBG_CACHE_STATS,
/* Driver options: */
- DBG_FORCE_SDMA,
- DBG_NO_SDMA,
- DBG_NO_SDMA_CLEARS,
- DBG_NO_SDMA_COPY_IMAGE,
DBG_NO_WC,
DBG_CHECK_VM,
DBG_RESERVE_VMID,
/* Tests: */
DBG_TEST_DMA,
DBG_TEST_VMFAULT_CP,
- DBG_TEST_VMFAULT_SDMA,
DBG_TEST_VMFAULT_SHADER,
DBG_TEST_DMA_PERF,
DBG_TEST_GDS,
int64_t time_flush;
};
-struct si_sdma_upload {
- struct si_resource *dst;
- struct si_resource *src;
- unsigned src_offset;
- unsigned dst_offset;
- unsigned size;
-};
-
struct si_small_prim_cull_info {
float scale[2], translate[2];
};
struct radeon_winsys *ws;
struct radeon_winsys_ctx *ctx;
struct radeon_cmdbuf gfx_cs; /* compute IB if graphics is disabled */
- struct radeon_cmdbuf sdma_cs;
struct pipe_fence_handle *last_gfx_fence;
- struct pipe_fence_handle *last_sdma_fence;
struct si_resource *eop_bug_scratch;
struct si_resource *eop_bug_scratch_tmz;
struct u_upload_mgr *cached_gtt_allocator;
unsigned num_spill_draw_calls;
unsigned num_compute_calls;
unsigned num_spill_compute_calls;
- unsigned num_dma_calls;
unsigned num_cp_dma_calls;
unsigned num_vs_flushes;
unsigned num_ps_flushes;
bool render_cond_invert;
bool render_cond_force_off; /* for u_blitter */
- /* For uploading data via GTT and copy to VRAM on context flush via SDMA. */
- bool sdma_uploads_in_progress;
- struct si_sdma_upload *sdma_uploads;
- unsigned num_sdma_uploads;
- unsigned max_sdma_uploads;
-
/* Shader-based queries. */
struct list_head shader_query_buffers;
unsigned num_active_shader_queries;
bool query_active;
} dcc_stats[5];
- /* Copy one resource to another using async DMA. */
- void (*dma_copy)(struct pipe_context *ctx, struct pipe_resource *dst, unsigned dst_level,
- unsigned dst_x, unsigned dst_y, unsigned dst_z, struct pipe_resource *src,
- unsigned src_level, const struct pipe_box *src_box);
-
struct si_tracked_regs tracked_regs;
/* Resources that need to be flushed, but will not get an explicit
struct hash_table *dirty_implicit_resources;
};
-/* cik_sdma.c */
-void cik_init_sdma_functions(struct si_context *sctx);
-
/* si_blit.c */
enum si_blitter_op /* bitmask */
{
enum ring_type ring);
bool si_replace_shader(unsigned num, struct si_shader_binary *binary);
-/* si_dma_cs.c */
-void si_dma_emit_timestamp(struct si_context *sctx, struct si_resource *dst, uint64_t offset);
-void si_sdma_clear_buffer(struct si_context *sctx, struct pipe_resource *dst, uint64_t offset,
- uint64_t size, unsigned clear_value);
-void si_sdma_copy_buffer(struct si_context *sctx, struct pipe_resource *dst,
- struct pipe_resource *src, uint64_t dst_offset, uint64_t src_offset,
- uint64_t size);
-void si_need_dma_space(struct si_context *ctx, unsigned num_dw, struct si_resource *dst,
- struct si_resource *src);
-void si_flush_dma_cs(struct si_context *ctx, unsigned flags, struct pipe_fence_handle **fence);
-
/* si_fence.c */
void si_cp_release_mem(struct si_context *ctx, struct radeon_cmdbuf *cs, unsigned event,
unsigned event_flags, unsigned dst_sel, unsigned int_sel, unsigned data_sel,
void si_set_tracked_regs_to_clear_state(struct si_context *ctx);
void si_begin_new_gfx_cs(struct si_context *ctx, bool first_cs);
void si_need_gfx_cs_space(struct si_context *ctx, unsigned num_draws);
-void si_unref_sdma_uploads(struct si_context *sctx);
/* si_gpu_load.c */
void si_gpu_load_kill_thread(struct si_screen *sscreen);
void si_init_viewport_functions(struct si_context *ctx);
/* si_texture.c */
-bool si_prepare_for_dma_blit(struct si_context *sctx, struct si_texture *dst, unsigned dst_level,
- unsigned dstx, unsigned dsty, unsigned dstz, struct si_texture *src,
- unsigned src_level, const struct pipe_box *src_box);
void si_eliminate_fast_color_clear(struct si_context *sctx, struct si_texture *tex,
bool *ctx_flushed);
void si_texture_discard_cmask(struct si_screen *sscreen, struct si_texture *tex);
* - if si_context_add_resource_size has been called for the buffer
* followed by *_need_cs_space for checking the memory usage
*
- * - if si_need_dma_space has been called for the buffer
- *
* - when emitting state packets and draw packets (because preceding packets
* can't be re-emitted at that point)
*
return RADEON_NUM_MAPPED_BUFFERS;
case SI_QUERY_NUM_GFX_IBS:
return RADEON_NUM_GFX_IBS;
- case SI_QUERY_NUM_SDMA_IBS:
- return RADEON_NUM_SDMA_IBS;
case SI_QUERY_GFX_BO_LIST_SIZE:
return RADEON_GFX_BO_LIST_COUNTER;
case SI_QUERY_GFX_IB_SIZE:
}
}
-static int64_t si_finish_dma_get_cpu_time(struct si_context *sctx)
-{
- struct pipe_fence_handle *fence = NULL;
-
- si_flush_dma_cs(sctx, 0, &fence);
- if (fence) {
- sctx->ws->fence_wait(sctx->ws, fence, PIPE_TIMEOUT_INFINITE);
- sctx->ws->fence_reference(&fence, NULL);
- }
-
- return os_time_get_nano();
-}
-
static bool si_query_sw_begin(struct si_context *sctx, struct si_query *squery)
{
struct si_query_sw *query = (struct si_query_sw *)squery;
case PIPE_QUERY_TIMESTAMP_DISJOINT:
case PIPE_QUERY_GPU_FINISHED:
break;
- case SI_QUERY_TIME_ELAPSED_SDMA_SI:
- query->begin_result = si_finish_dma_get_cpu_time(sctx);
- break;
case SI_QUERY_DRAW_CALLS:
query->begin_result = sctx->num_draw_calls;
break;
case SI_QUERY_SPILL_COMPUTE_CALLS:
query->begin_result = sctx->num_spill_compute_calls;
break;
- case SI_QUERY_DMA_CALLS:
- query->begin_result = sctx->num_dma_calls;
- break;
case SI_QUERY_CP_DMA_CALLS:
query->begin_result = sctx->num_cp_dma_calls;
break;
case SI_QUERY_BUFFER_WAIT_TIME:
case SI_QUERY_GFX_IB_SIZE:
case SI_QUERY_NUM_GFX_IBS:
- case SI_QUERY_NUM_SDMA_IBS:
case SI_QUERY_NUM_BYTES_MOVED:
case SI_QUERY_NUM_EVICTIONS:
case SI_QUERY_NUM_VRAM_CPU_PAGE_FAULTS: {
case PIPE_QUERY_GPU_FINISHED:
sctx->b.flush(&sctx->b, &query->fence, PIPE_FLUSH_DEFERRED);
break;
- case SI_QUERY_TIME_ELAPSED_SDMA_SI:
- query->end_result = si_finish_dma_get_cpu_time(sctx);
- break;
case SI_QUERY_DRAW_CALLS:
query->end_result = sctx->num_draw_calls;
break;
case SI_QUERY_SPILL_COMPUTE_CALLS:
query->end_result = sctx->num_spill_compute_calls;
break;
- case SI_QUERY_DMA_CALLS:
- query->end_result = sctx->num_dma_calls;
- break;
case SI_QUERY_CP_DMA_CALLS:
query->end_result = sctx->num_cp_dma_calls;
break;
case SI_QUERY_GFX_IB_SIZE:
case SI_QUERY_NUM_MAPPED_BUFFERS:
case SI_QUERY_NUM_GFX_IBS:
- case SI_QUERY_NUM_SDMA_IBS:
case SI_QUERY_NUM_BYTES_MOVED:
case SI_QUERY_NUM_EVICTIONS:
case SI_QUERY_NUM_VRAM_CPU_PAGE_FAULTS: {
query->result_size += 16; /* for the fence + alignment */
query->b.num_cs_dw_suspend = 6 + si_cp_write_fence_dwords(sscreen);
break;
- case SI_QUERY_TIME_ELAPSED_SDMA:
- /* GET_GLOBAL_TIMESTAMP only works if the offset is a multiple of 32. */
- query->result_size = 64;
- break;
case PIPE_QUERY_TIME_ELAPSED:
query->result_size = 24;
query->b.num_cs_dw_suspend = 8 + si_cp_write_fence_dwords(sscreen);
struct radeon_cmdbuf *cs = &sctx->gfx_cs;
switch (query->b.type) {
- case SI_QUERY_TIME_ELAPSED_SDMA:
- si_dma_emit_timestamp(sctx, buffer, va - buffer->gpu_address);
- return;
case PIPE_QUERY_OCCLUSION_COUNTER:
case PIPE_QUERY_OCCLUSION_PREDICATE:
case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
if (query->b.type == PIPE_QUERY_PIPELINE_STATISTICS)
sctx->num_pipeline_stat_queries++;
- if (query->b.type != SI_QUERY_TIME_ELAPSED_SDMA)
- si_need_gfx_cs_space(sctx, 0);
+ si_need_gfx_cs_space(sctx, 0);
va = query->buffer.buf->gpu_address + query->buffer.results_end;
query->ops->emit_start(sctx, query, query->buffer.buf, va);
uint64_t fence_va = 0;
switch (query->b.type) {
- case SI_QUERY_TIME_ELAPSED_SDMA:
- si_dma_emit_timestamp(sctx, buffer, va + 32 - buffer->gpu_address);
- return;
case PIPE_QUERY_OCCLUSION_COUNTER:
case PIPE_QUERY_OCCLUSION_PREDICATE:
case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
struct si_screen *sscreen = (struct si_screen *)ctx->screen;
if (query_type == PIPE_QUERY_TIMESTAMP_DISJOINT || query_type == PIPE_QUERY_GPU_FINISHED ||
- (query_type >= PIPE_QUERY_DRIVER_SPECIFIC && query_type != SI_QUERY_TIME_ELAPSED_SDMA))
+ (query_type >= PIPE_QUERY_DRIVER_SPECIFIC))
return si_query_sw_create(query_type);
if (sscreen->use_ngg_streamout &&
case PIPE_QUERY_TIME_ELAPSED:
result->u64 += si_query_read_result(buffer, 0, 2, false);
break;
- case SI_QUERY_TIME_ELAPSED_SDMA:
- result->u64 += si_query_read_result(buffer, 0, 32 / 4, false);
- break;
case PIPE_QUERY_TIMESTAMP:
result->u64 = *(uint64_t *)buffer;
break;
}
/* Convert the time to expected units. */
- if (squery->type == PIPE_QUERY_TIME_ELAPSED || squery->type == SI_QUERY_TIME_ELAPSED_SDMA ||
+ if (squery->type == PIPE_QUERY_TIME_ELAPSED ||
squery->type == PIPE_QUERY_TIMESTAMP) {
result->u64 = (1000000 * result->u64) / sscreen->info.clock_crystal_freq;
}
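For context, the surviving conversion turns GPU counter ticks into nanoseconds: clock_crystal_freq is reported in kHz, so 1000000 * ticks / freq yields ns (PIPE_QUERY_TIME_ELAPSED and PIPE_QUERY_TIMESTAMP results are defined in nanoseconds). A minimal standalone sketch with made-up numbers, not driver code:

#include <inttypes.h>
#include <stdio.h>

int main(void)
{
   /* Hypothetical values: 54000 ticks measured on a 27000 kHz counter. */
   uint64_t ticks = 54000;
   uint64_t clock_crystal_freq = 27000; /* kHz */
   uint64_t ns = (1000000 * ticks) / clock_crystal_freq; /* 2000000 ns = 2 ms */
   printf("%" PRIu64 " ns\n", ns);
   return 0;
}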
X("spill-draw-calls", SPILL_DRAW_CALLS, UINT64, AVERAGE),
X("compute-calls", COMPUTE_CALLS, UINT64, AVERAGE),
X("spill-compute-calls", SPILL_COMPUTE_CALLS, UINT64, AVERAGE),
- X("dma-calls", DMA_CALLS, UINT64, AVERAGE),
X("cp-dma-calls", CP_DMA_CALLS, UINT64, AVERAGE),
X("num-vs-flushes", NUM_VS_FLUSHES, UINT64, AVERAGE),
X("num-ps-flushes", NUM_PS_FLUSHES, UINT64, AVERAGE),
X("buffer-wait-time", BUFFER_WAIT_TIME, MICROSECONDS, CUMULATIVE),
X("num-mapped-buffers", NUM_MAPPED_BUFFERS, UINT64, AVERAGE),
X("num-GFX-IBs", NUM_GFX_IBS, UINT64, AVERAGE),
- X("num-SDMA-IBs", NUM_SDMA_IBS, UINT64, AVERAGE),
X("GFX-BO-list-size", GFX_BO_LIST_SIZE, UINT64, AVERAGE),
X("GFX-IB-size", GFX_IB_SIZE, UINT64, AVERAGE),
X("num-bytes-moved", NUM_BYTES_MOVED, BYTES, CUMULATIVE),
SI_QUERY_SPILL_DRAW_CALLS,
SI_QUERY_COMPUTE_CALLS,
SI_QUERY_SPILL_COMPUTE_CALLS,
- SI_QUERY_DMA_CALLS,
SI_QUERY_CP_DMA_CALLS,
SI_QUERY_NUM_VS_FLUSHES,
SI_QUERY_NUM_PS_FLUSHES,
SI_QUERY_BUFFER_WAIT_TIME,
SI_QUERY_NUM_MAPPED_BUFFERS,
SI_QUERY_NUM_GFX_IBS,
- SI_QUERY_NUM_SDMA_IBS,
SI_QUERY_GFX_BO_LIST_SIZE,
SI_QUERY_GFX_IB_SIZE,
SI_QUERY_NUM_BYTES_MOVED,
SI_QUERY_GPIN_NUM_RB,
SI_QUERY_GPIN_NUM_SPI,
SI_QUERY_GPIN_NUM_SE,
- SI_QUERY_TIME_ELAPSED_SDMA,
- SI_QUERY_TIME_ELAPSED_SDMA_SI, /* emulated, measured on the CPU */
SI_QUERY_PD_NUM_PRIMS_ACCEPTED,
SI_QUERY_PD_NUM_PRIMS_REJECTED,
SI_QUERY_PD_NUM_PRIMS_INELIGIBLE,
*
*/
-/* This file implements randomized SDMA texture blit tests. */
+/* This file implements randomized texture blit tests. */
#include "si_pipe.h"
#include "util/rand_xor.h"
struct si_texture *ssrc;
struct cpu_texture src_cpu, dst_cpu;
unsigned max_width, max_height, max_depth, j, num;
- unsigned gfx_blits = 0, dma_blits = 0, cs_blits = 0, max_tex_side_gen;
+ unsigned gfx_blits = 0, cs_blits = 0, max_tex_side_gen;
unsigned max_tex_layers;
bool pass;
bool do_partial_copies = rand() & 1;
int srcx, srcy, srcz, dstx, dsty, dstz;
struct pipe_box box;
unsigned old_num_draw_calls = sctx->num_draw_calls;
- unsigned old_num_dma_calls = sctx->num_dma_calls;
unsigned old_num_cs_calls = sctx->num_compute_calls;
if (!do_partial_copies) {
/* GPU copy */
u_box_3d(srcx, srcy, srcz, width, height, depth, &box);
- sctx->dma_copy(ctx, dst, 0, dstx, dsty, dstz, src, 0, &box);
+ si_resource_copy_region(ctx, dst, 0, dstx, dsty, dstz, src, 0, &box);
/* See which engine was used. */
gfx_blits += sctx->num_draw_calls > old_num_draw_calls;
- dma_blits += sctx->num_dma_calls > old_num_dma_calls;
cs_blits += sctx->num_compute_calls > old_num_cs_calls;
/* CPU copy */
else
num_fail++;
- printf("BLITs: GFX = %2u, DMA = %2u, CS = %2u, %s [%u/%u]\n", gfx_blits, dma_blits, cs_blits,
+ printf("BLITs: GFX = %2u, CS = %2u, %s [%u/%u]\n", gfx_blits, cs_blits,
pass ? "pass" : "fail", num_pass, num_pass + num_fail);
/* cleanup */
static const unsigned cs_waves_per_sh_list[] = {0, 4, 8, 16};
#define NUM_SHADERS ARRAY_SIZE(cs_dwords_per_thread_list)
-#define NUM_METHODS (4 + 3 * NUM_SHADERS * ARRAY_SIZE(cs_waves_per_sh_list))
+#define NUM_METHODS (3 + 3 * NUM_SHADERS * ARRAY_SIZE(cs_waves_per_sh_list))
static const char *method_str[] = {
"CP MC ",
"CP L2 ",
"CP L2 ",
- "SDMA ",
};
static const char *placement_str[] = {
/* Clear */
struct si_result {
bool is_valid;
bool is_cp;
- bool is_sdma;
bool is_cs;
unsigned cache_policy;
unsigned dwords_per_thread;
for (unsigned method = 0; method < NUM_METHODS; method++) {
bool test_cp = method <= 2;
- bool test_sdma = method == 3;
- bool test_cs = method >= 4;
- unsigned cs_method = method - 4;
+ bool test_cs = method >= 3;
+ unsigned cs_method = method - 3;
unsigned cs_waves_per_sh =
test_cs ? cs_waves_per_sh_list[cs_method / (3 * NUM_SHADERS)] : 0;
cs_method %= 3 * NUM_SHADERS;
unsigned cs_dwords_per_thread =
test_cs ? cs_dwords_per_thread_list[cs_method % NUM_SHADERS] : 0;
- if (test_sdma && !sctx->sdma_cs.priv)
- continue;
-
if (sctx->chip_class == GFX6) {
/* GFX6 doesn't support CP DMA operations through L2. */
if (test_cp && cache_policy != L2_BYPASS)
unsigned query_type = PIPE_QUERY_TIME_ELAPSED;
unsigned flags = cache_policy == L2_BYPASS ? SI_RESOURCE_FLAG_UNCACHED : 0;
- if (test_sdma) {
- if (sctx->chip_class == GFX6)
- query_type = SI_QUERY_TIME_ELAPSED_SDMA_SI;
- else
- query_type = SI_QUERY_TIME_ELAPSED_SDMA;
- }
-
if (placement == 0 || placement == 2 || placement == 4)
dst_usage = PIPE_USAGE_DEFAULT;
else
si_cp_dma_clear_buffer(sctx, &sctx->gfx_cs, dst, 0, size, clear_value, 0,
SI_COHERENCY_NONE, cache_policy);
}
- } else if (test_sdma) {
- /* SDMA */
- if (is_copy) {
- si_sdma_copy_buffer(sctx, dst, src, 0, 0, size);
- } else {
- si_sdma_clear_buffer(sctx, dst, 0, size, clear_value);
- }
} else {
/* Compute */
/* The memory accesses are coalesced, meaning that the 1st instruction writes
}
/* Flush L2, so that we don't just test L2 cache performance except for L2_LRU. */
- if (!test_sdma) {
- sctx->flags |= SI_CONTEXT_INV_VCACHE |
- (cache_policy == L2_LRU ? 0 : SI_CONTEXT_INV_L2) |
- SI_CONTEXT_CS_PARTIAL_FLUSH;
- sctx->emit_cache_flush(sctx);
- }
+ sctx->flags |= SI_CONTEXT_INV_VCACHE |
+ (cache_policy == L2_LRU ? 0 : SI_CONTEXT_INV_L2) |
+ SI_CONTEXT_CS_PARTIAL_FLUSH;
+ sctx->emit_cache_flush(sctx);
}
ctx->end_query(ctx, q);
struct si_result *r = &results[util_logbase2(size)][placement][method];
r->is_valid = true;
r->is_cp = test_cp;
- r->is_sdma = test_sdma;
r->is_cs = test_cs;
r->cache_policy = cache_policy;
r->dwords_per_thread = cs_dwords_per_thread;
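With the SDMA slot dropped from the perf test, the method index space is now: 0-2 for the CP variants, 3 and up for the compute variants, decomposed into a waves-per-SH choice, a three-way sub-option (assumed here to be the cache policy, which this hunk does not show), and a dwords-per-thread choice. A small hypothetical sketch of that decomposition, useful for sanity-checking NUM_METHODS; the list sizes below are assumptions standing in for the real arrays:

/* Hypothetical sanity check of the method indexing after the SDMA slot
 * was removed: indices 0..2 are the CP variants, everything from 3 on
 * is a compute variant.  The inner factor of 3 is assumed to be the
 * cache-policy choice; NUM_SHADERS is an assumed list size. */
#include <assert.h>
#include <stdio.h>

#define NUM_SHADERS        3   /* assumed size of cs_dwords_per_thread_list */
#define NUM_WAVES_OPTIONS  4   /* matches {0, 4, 8, 16} */
#define NUM_METHODS        (3 + 3 * NUM_SHADERS * NUM_WAVES_OPTIONS)

int main(void)
{
   for (unsigned method = 0; method < NUM_METHODS; method++) {
      if (method <= 2) {
         printf("method %2u: CP variant %u\n", method, method);
         continue;
      }
      unsigned cs_method = method - 3;
      unsigned waves_idx = cs_method / (3 * NUM_SHADERS);
      cs_method %= 3 * NUM_SHADERS;
      unsigned policy_idx = cs_method / NUM_SHADERS; /* assumed meaning */
      unsigned dwords_idx = cs_method % NUM_SHADERS;
      assert(waves_idx < NUM_WAVES_OPTIONS);
      printf("method %2u: CS waves_idx=%u policy_idx=%u dwords_idx=%u\n",
             method, waves_idx, policy_idx, dwords_idx);
   }
   return 0;
}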
bool cached = mode == 1;
if (async)
- puts(" if (async) { /* SDMA or async compute */");
+ puts(" if (async) { /* async compute */");
else if (cached)
puts(" if (cached) { /* gfx ring */");
else
if (r->is_cs && r->waves_per_sh == 0)
continue;
} else {
- /* SDMA is always asynchronous */
- if (r->is_sdma)
- continue;
-
if (cached && r->cache_policy == L2_BYPASS)
continue;
if (!cached && r->cache_policy == L2_LRU)
*/
if (!best ||
/* If it's the same method as for the previous size: */
- (prev->is_cp == best->is_cp && prev->is_sdma == best->is_sdma &&
+ (prev->is_cp == best->is_cp &&
prev->is_cs == best->is_cs && prev->cache_policy == best->cache_policy &&
prev->dwords_per_thread == best->dwords_per_thread &&
prev->waves_per_sh == best->waves_per_sh) ||
if (best->is_cp) {
printf("CP_DMA(%s);\n", cache_policy_str);
}
- if (best->is_sdma)
- printf("SDMA;\n");
if (best->is_cs) {
printf("COMPUTE(%s, %u, %u);\n", cache_policy_str,
best->dwords_per_thread, best->waves_per_sh);
static bool si_texture_is_aux_plane(const struct pipe_resource *resource);
-bool si_prepare_for_dma_blit(struct si_context *sctx, struct si_texture *dst, unsigned dst_level,
- unsigned dstx, unsigned dsty, unsigned dstz, struct si_texture *src,
- unsigned src_level, const struct pipe_box *src_box)
-{
- if (!sctx->sdma_cs.priv)
- return false;
-
- if (dst->surface.bpe != src->surface.bpe)
- return false;
-
- /* MSAA: Blits don't exist in the real world. */
- if (src->buffer.b.b.nr_samples > 1 || dst->buffer.b.b.nr_samples > 1)
- return false;
-
- /* Depth-stencil surfaces:
- * When dst is linear, the DB->CB copy preserves HTILE.
- * When dst is tiled, the 3D path must be used to update HTILE.
- */
- if (src->is_depth || dst->is_depth)
- return false;
-
- /* DCC as:
- * src: Use the 3D path. DCC decompression is expensive.
- * dst: Use the 3D path to compress the pixels with DCC.
- */
- if (vi_dcc_enabled(src, src_level) || vi_dcc_enabled(dst, dst_level))
- return false;
-
- /* TMZ: mixing encrypted and non-encrypted buffers in a single command
- * doesn't seem to be supported.
- */
- if ((src->buffer.flags & RADEON_FLAG_ENCRYPTED) !=
- (dst->buffer.flags & RADEON_FLAG_ENCRYPTED))
- return false;
-
- /* CMASK as:
- * src: Both texture and SDMA paths need decompression. Use SDMA.
- * dst: If overwriting the whole texture, discard CMASK and use
- * SDMA. Otherwise, use the 3D path.
- */
- if (dst->cmask_buffer && dst->dirty_level_mask & (1 << dst_level)) {
- /* The CMASK clear is only enabled for the first level. */
- assert(dst_level == 0);
- if (!util_texrange_covers_whole_level(&dst->buffer.b.b, dst_level, dstx, dsty, dstz,
- src_box->width, src_box->height, src_box->depth))
- return false;
-
- si_texture_discard_cmask(sctx->screen, dst);
- }
-
- /* All requirements are met. Prepare textures for SDMA. */
- if (src->cmask_buffer && src->dirty_level_mask & (1 << src_level))
- sctx->b.flush_resource(&sctx->b, &src->buffer.b.b);
-
- assert(!(src->dirty_level_mask & (1 << src_level)));
- assert(!(dst->dirty_level_mask & (1 << dst_level)));
-
- return true;
-}
-
/* Same as resource_copy_region, except that both upsampling and downsampling are allowed. */
static void si_copy_region_with_blit(struct pipe_context *pipe, struct pipe_resource *dst,
unsigned dst_level, unsigned dstx, unsigned dsty,
/* Copy from a full GPU texture to a transfer's staging one. */
static void si_copy_to_staging_texture(struct pipe_context *ctx, struct si_transfer *stransfer)
{
- struct si_context *sctx = (struct si_context *)ctx;
struct pipe_transfer *transfer = (struct pipe_transfer *)stransfer;
struct pipe_resource *dst = &stransfer->staging->b.b;
struct pipe_resource *src = transfer->resource;
return;
}
- sctx->dma_copy(ctx, dst, 0, 0, 0, 0, src, transfer->level, &transfer->box);
+ si_resource_copy_region(ctx, dst, 0, 0, 0, 0, src, transfer->level, &transfer->box);
}
/* Copy from a transfer's staging texture to a full GPU one. */
static void si_copy_from_staging_texture(struct pipe_context *ctx, struct si_transfer *stransfer)
{
- struct si_context *sctx = (struct si_context *)ctx;
struct pipe_transfer *transfer = (struct pipe_transfer *)stransfer;
struct pipe_resource *dst = transfer->resource;
struct pipe_resource *src = &stransfer->staging->b.b;
sbox.height = util_format_get_nblocksx(dst->format, sbox.height);
}
- sctx->dma_copy(ctx, dst, transfer->level, transfer->box.x, transfer->box.y, transfer->box.z, src,
- 0, &sbox);
+ si_resource_copy_region(ctx, dst, transfer->level, transfer->box.x, transfer->box.y,
+ transfer->box.z, src, 0, &sbox);
}
static unsigned si_texture_get_offset(struct si_screen *sscreen, struct si_texture *tex,
u_box_3d(0, 0, 0, u_minify(templ.width0, i), u_minify(templ.height0, i),
util_num_layers(&templ, i), &box);
- sctx->dma_copy(&sctx->b, &new_tex->buffer.b.b, i, 0, 0, 0, &tex->buffer.b.b, i, &box);
+ si_resource_copy_region(&sctx->b, &new_tex->buffer.b.b,
+ i, 0, 0, 0, &tex->buffer.b.b, i, &box);
}
}
struct si_context *sctx = (struct si_context *)sscreen->aux_context;
simple_mtx_lock(&sscreen->aux_context_lock);
- si_sdma_copy_buffer(sctx, &tex->dcc_retile_buffer->b.b, &buf->b.b, 0,
- 0, buf->b.b.width0);
+ si_copy_buffer(sctx, &tex->dcc_retile_buffer->b.b, &buf->b.b, 0,
+ 0, buf->b.b.width0);
sscreen->aux_context->flush(sscreen->aux_context, NULL, 0);
simple_mtx_unlock(&sscreen->aux_context_lock);