compute_min_size = 4 * 1024;
}
+ /* TODO: use compute for unaligned big sizes */
if (method == SI_AUTO_SELECT_CLEAR_METHOD && (
clear_value_size > 4 ||
(clear_value_size == 4 && offset % 4 == 0 && size > compute_min_size))) {
si_improve_sync_flags(sctx, dst, src, &flags);
/* Only use compute for VRAM copies on dGPUs. */
+ /* TODO: use compute for unaligned big sizes */
if (sctx->screen->info.has_dedicated_vram && si_resource(dst)->domains & RADEON_DOMAIN_VRAM &&
si_resource(src)->domains & RADEON_DOMAIN_VRAM && size > compute_min_size &&
dst_offset % 4 == 0 && src_offset % 4 == 0 && size % 4 == 0) {
static inline unsigned cp_dma_max_byte_count(struct si_context *sctx)
{
unsigned max =
+ sctx->chip_class >= GFX11 ? 32767 :
sctx->chip_class >= GFX9 ? S_415_BYTE_COUNT_GFX9(~0u) : S_415_BYTE_COUNT_GFX6(~0u);
/* make it aligned for optimal performance */
assert(sctx->chip_class >= GFX7);
+ if (sctx->chip_class >= GFX11)
+ size = MIN2(size, 32768 - SI_CPDMA_ALIGNMENT);
+
/* The prefetch address and size must be aligned, so that we don't have to apply
* the complicated hw bug workaround.
*