From 3ca2001b537a2709e7ef60410e7dfad5d38663f4 Mon Sep 17 00:00:00 2001 From: Ilia Mirkin Date: Sat, 30 Jan 2016 10:02:43 -0500 Subject: [PATCH] nv50,nvc0: fix buffer clearing to respect engine alignment requirements It appears that the nvidia render engine is quite picky when it comes to linear surfaces. It doesn't like non-256-byte aligned offsets, and apparently doesn't even do non-256-byte strides. This makes arb_clear_buffer_object-unaligned pass on both nv50 and nvc0. As a side-effect this also allows RGB32 clears to work via GPU data upload instead of synchronizing the buffer to the CPU (nvc0 only). Signed-off-by: Ilia Mirkin # tested on GF108, GT215 Tested-by: Nick Sarnie # GK208 Cc: mesa-stable@lists.freedesktop.org --- src/gallium/drivers/nouveau/nv50/nv50_surface.c | 120 +++++++++++++--- src/gallium/drivers/nouveau/nvc0/nvc0_surface.c | 179 +++++++++++++++++++----- 2 files changed, 247 insertions(+), 52 deletions(-) diff --git a/src/gallium/drivers/nouveau/nv50/nv50_surface.c b/src/gallium/drivers/nouveau/nv50/nv50_surface.c index 86be1b4..ec5cf37 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_surface.c +++ b/src/gallium/drivers/nouveau/nv50/nv50_surface.c @@ -595,6 +595,82 @@ nv50_clear(struct pipe_context *pipe, unsigned buffers, } static void +nv50_clear_buffer_push(struct pipe_context *pipe, + struct pipe_resource *res, + unsigned offset, unsigned size, + const void *data, int data_size) +{ + struct nv50_context *nv50 = nv50_context(pipe); + struct nouveau_pushbuf *push = nv50->base.pushbuf; + struct nv04_resource *buf = nv04_resource(res); + unsigned count = (size + 3) / 4; + unsigned xcoord = offset & 0xff; + unsigned tmp, i; + + if (data_size == 1) { + tmp = *(unsigned char *)data; + tmp = (tmp << 24) | (tmp << 16) | (tmp << 8) | tmp; + data = &tmp; + data_size = 4; + } else if (data_size == 2) { + tmp = *(unsigned short *)data; + tmp = (tmp << 16) | tmp; + data = &tmp; + data_size = 4; + } + + unsigned data_words = data_size / 4; + + nouveau_bufctx_refn(nv50->bufctx, 0, buf->bo, buf->domain | NOUVEAU_BO_WR); + nouveau_pushbuf_bufctx(push, nv50->bufctx); + nouveau_pushbuf_validate(push); + + offset &= ~0xff; + + BEGIN_NV04(push, NV50_2D(DST_FORMAT), 2); + PUSH_DATA (push, NV50_SURFACE_FORMAT_R8_UNORM); + PUSH_DATA (push, 1); + BEGIN_NV04(push, NV50_2D(DST_PITCH), 5); + PUSH_DATA (push, 262144); + PUSH_DATA (push, 65536); + PUSH_DATA (push, 1); + PUSH_DATAh(push, buf->address + offset); + PUSH_DATA (push, buf->address + offset); + BEGIN_NV04(push, NV50_2D(SIFC_BITMAP_ENABLE), 2); + PUSH_DATA (push, 0); + PUSH_DATA (push, NV50_SURFACE_FORMAT_R8_UNORM); + BEGIN_NV04(push, NV50_2D(SIFC_WIDTH), 10); + PUSH_DATA (push, size); + PUSH_DATA (push, 1); + PUSH_DATA (push, 0); + PUSH_DATA (push, 1); + PUSH_DATA (push, 0); + PUSH_DATA (push, 1); + PUSH_DATA (push, 0); + PUSH_DATA (push, xcoord); + PUSH_DATA (push, 0); + PUSH_DATA (push, 0); + + while (count) { + unsigned nr_data = MIN2(count, NV04_PFIFO_MAX_PACKET_LEN) / data_words; + unsigned nr = nr_data * data_words; + + BEGIN_NI04(push, NV50_2D(SIFC_DATA), nr); + for (i = 0; i < nr_data; i++) + PUSH_DATAp(push, data, data_words); + + count -= nr; + } + + if (buf->mm) { + nouveau_fence_ref(nv50->screen->base.fence.current, &buf->fence); + nouveau_fence_ref(nv50->screen->base.fence.current, &buf->fence_wr); + } + + nouveau_bufctx_reset(nv50->bufctx, 0); +} + +static void nv50_clear_buffer(struct pipe_context *pipe, struct pipe_resource *res, unsigned offset, unsigned size, @@ -643,9 +719,22 @@ nv50_clear_buffer(struct pipe_context *pipe, assert(size % data_size == 0); + if (offset & 0xff) { + unsigned fixup_size = MIN2(size, align(offset, 0x100) - offset); + assert(fixup_size % data_size == 0); + nv50_clear_buffer_push(pipe, res, offset, fixup_size, data, data_size); + offset += fixup_size; + size -= fixup_size; + if (!size) + return; + } + elements = size / data_size; height = (elements + 8191) / 8192; width = elements / height; + if (height > 1) + width &= ~0xff; + assert(width > 0); BEGIN_NV04(push, NV50_3D(CLEAR_COLOR(0)), 4); PUSH_DATAf(push, color.f[0]); @@ -669,13 +758,13 @@ nv50_clear_buffer(struct pipe_context *pipe, BEGIN_NV04(push, NV50_3D(RT_CONTROL), 1); PUSH_DATA (push, 1); BEGIN_NV04(push, NV50_3D(RT_ADDRESS_HIGH(0)), 5); - PUSH_DATAh(push, buf->bo->offset + buf->offset + offset); - PUSH_DATA (push, buf->bo->offset + buf->offset + offset); + PUSH_DATAh(push, buf->address + offset); + PUSH_DATA (push, buf->address + offset); PUSH_DATA (push, nv50_format_table[dst_fmt].rt); PUSH_DATA (push, 0); PUSH_DATA (push, 0); BEGIN_NV04(push, NV50_3D(RT_HORIZ(0)), 2); - PUSH_DATA (push, NV50_3D_RT_HORIZ_LINEAR | (width * data_size)); + PUSH_DATA (push, NV50_3D_RT_HORIZ_LINEAR | align(width * data_size, 0x100)); PUSH_DATA (push, height); BEGIN_NV04(push, NV50_3D(ZETA_ENABLE), 1); PUSH_DATA (push, 0); @@ -694,26 +783,21 @@ nv50_clear_buffer(struct pipe_context *pipe, BEGIN_NI04(push, NV50_3D(CLEAR_BUFFERS), 1); PUSH_DATA (push, 0x3c); + BEGIN_NV04(push, NV50_3D(COND_MODE), 1); + PUSH_DATA (push, nv50->cond_condmode); + + if (buf->mm) { + nouveau_fence_ref(nv50->screen->base.fence.current, &buf->fence); + nouveau_fence_ref(nv50->screen->base.fence.current, &buf->fence_wr); + } + if (width * height != elements) { offset += width * height * data_size; width = elements - width * height; - height = 1; - BEGIN_NV04(push, NV50_3D(RT_ADDRESS_HIGH(0)), 2); - PUSH_DATAh(push, buf->bo->offset + buf->offset + offset); - PUSH_DATA (push, buf->bo->offset + buf->offset + offset); - BEGIN_NV04(push, NV50_3D(RT_HORIZ(0)), 2); - PUSH_DATA (push, NV50_3D_RT_HORIZ_LINEAR | (width * data_size)); - PUSH_DATA (push, height); - BEGIN_NI04(push, NV50_3D(CLEAR_BUFFERS), 1); - PUSH_DATA (push, 0x3c); + nv50_clear_buffer_push(pipe, res, offset, width * data_size, + data, data_size); } - BEGIN_NV04(push, NV50_3D(COND_MODE), 1); - PUSH_DATA (push, nv50->cond_condmode); - - nouveau_fence_ref(nv50->screen->base.fence.current, &buf->fence); - nouveau_fence_ref(nv50->screen->base.fence.current, &buf->fence_wr); - nv50->dirty |= NV50_NEW_FRAMEBUFFER | NV50_NEW_SCISSOR; } diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c b/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c index 4e43c4e..71726d1 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c @@ -357,27 +357,132 @@ nvc0_clear_render_target(struct pipe_context *pipe, } static void -nvc0_clear_buffer_cpu(struct pipe_context *pipe, - struct pipe_resource *res, - unsigned offset, unsigned size, - const void *data, int data_size) +nvc0_clear_buffer_push_nvc0(struct pipe_context *pipe, + struct pipe_resource *res, + unsigned offset, unsigned size, + const void *data, int data_size) { + struct nvc0_context *nvc0 = nvc0_context(pipe); + struct nouveau_pushbuf *push = nvc0->base.pushbuf; struct nv04_resource *buf = nv04_resource(res); - struct pipe_transfer *pt; - struct pipe_box box; - unsigned elements, i; + unsigned i; - elements = size / data_size; + nouveau_bufctx_refn(nvc0->bufctx, 0, buf->bo, buf->domain | NOUVEAU_BO_WR); + nouveau_pushbuf_bufctx(push, nvc0->bufctx); + nouveau_pushbuf_validate(push); - u_box_1d(offset, size, &box); + unsigned count = (size + 3) / 4; + unsigned data_words = data_size / 4; - uint8_t *map = buf->vtbl->transfer_map(pipe, res, 0, PIPE_TRANSFER_WRITE, - &box, &pt); + while (count) { + unsigned nr_data = MIN2(count, NV04_PFIFO_MAX_PACKET_LEN) / data_words; + unsigned nr = nr_data * data_words; - for (i = 0; i < elements; ++i) - memcpy(&map[i*data_size], data, data_size); + if (!PUSH_SPACE(push, nr + 9)) + break; - buf->vtbl->transfer_unmap(pipe, pt); + BEGIN_NVC0(push, NVC0_M2MF(OFFSET_OUT_HIGH), 2); + PUSH_DATAh(push, buf->address + offset); + PUSH_DATA (push, buf->address + offset); + BEGIN_NVC0(push, NVC0_M2MF(LINE_LENGTH_IN), 2); + PUSH_DATA (push, MIN2(size, nr * 4)); + PUSH_DATA (push, 1); + BEGIN_NVC0(push, NVC0_M2MF(EXEC), 1); + PUSH_DATA (push, 0x100111); + + /* must not be interrupted (trap on QUERY fence, 0x50 works however) */ + BEGIN_NIC0(push, NVC0_M2MF(DATA), nr); + for (i = 0; i < nr_data; i++) + PUSH_DATAp(push, data, data_words); + + count -= nr; + offset += nr * 4; + size -= nr * 4; + } + + if (buf->mm) { + nouveau_fence_ref(nvc0->screen->base.fence.current, &buf->fence); + nouveau_fence_ref(nvc0->screen->base.fence.current, &buf->fence_wr); + } + + nouveau_bufctx_reset(nvc0->bufctx, 0); +} + +static void +nvc0_clear_buffer_push_nve4(struct pipe_context *pipe, + struct pipe_resource *res, + unsigned offset, unsigned size, + const void *data, int data_size) +{ + struct nvc0_context *nvc0 = nvc0_context(pipe); + struct nouveau_pushbuf *push = nvc0->base.pushbuf; + struct nv04_resource *buf = nv04_resource(res); + unsigned i; + + nouveau_bufctx_refn(nvc0->bufctx, 0, buf->bo, buf->domain | NOUVEAU_BO_WR); + nouveau_pushbuf_bufctx(push, nvc0->bufctx); + nouveau_pushbuf_validate(push); + + unsigned count = (size + 3) / 4; + unsigned data_words = data_size / 4; + + while (count) { + unsigned nr_data = MIN2(count, NV04_PFIFO_MAX_PACKET_LEN) / data_words; + unsigned nr = nr_data * data_words; + + if (!PUSH_SPACE(push, nr + 10)) + break; + + BEGIN_NVC0(push, NVE4_P2MF(UPLOAD_DST_ADDRESS_HIGH), 2); + PUSH_DATAh(push, buf->address + offset); + PUSH_DATA (push, buf->address + offset); + BEGIN_NVC0(push, NVE4_P2MF(UPLOAD_LINE_LENGTH_IN), 2); + PUSH_DATA (push, MIN2(size, nr * 4)); + PUSH_DATA (push, 1); + /* must not be interrupted (trap on QUERY fence, 0x50 works however) */ + BEGIN_1IC0(push, NVE4_P2MF(UPLOAD_EXEC), nr + 1); + PUSH_DATA (push, 0x1001); + for (i = 0; i < nr_data; i++) + PUSH_DATAp(push, data, data_words); + + count -= nr; + offset += nr * 4; + size -= nr * 4; + } + + if (buf->mm) { + nouveau_fence_ref(nvc0->screen->base.fence.current, &buf->fence); + nouveau_fence_ref(nvc0->screen->base.fence.current, &buf->fence_wr); + } + + nouveau_bufctx_reset(nvc0->bufctx, 0); +} + +static void +nvc0_clear_buffer_push(struct pipe_context *pipe, + struct pipe_resource *res, + unsigned offset, unsigned size, + const void *data, int data_size) +{ + struct nvc0_context *nvc0 = nvc0_context(pipe); + unsigned tmp; + + if (data_size == 1) { + tmp = *(unsigned char *)data; + tmp = (tmp << 24) | (tmp << 16) | (tmp << 8) | tmp; + data = &tmp; + data_size = 4; + } else if (data_size == 2) { + tmp = *(unsigned short *)data; + tmp = (tmp << 16) | tmp; + data = &tmp; + data_size = 4; + } + + if (nvc0->screen->base.class_3d < NVE4_3D_CLASS) + nvc0_clear_buffer_push_nvc0(pipe, res, offset, size, data, data_size); + else + nvc0_clear_buffer_push_nve4(pipe, res, offset, size, data, data_size); } static void @@ -402,10 +507,8 @@ nvc0_clear_buffer(struct pipe_context *pipe, memcpy(&color.ui, data, 16); break; case 12: - /* This doesn't work, RGB32 is not a valid RT format. - * dst_fmt = PIPE_FORMAT_R32G32B32_UINT; - * memcpy(&color.ui, data, 12); - * memset(&color.ui[3], 0, 4); + /* RGB32 is not a valid RT format. This will be handled by the pushbuf + * uploader. */ break; case 8: @@ -437,14 +540,26 @@ nvc0_clear_buffer(struct pipe_context *pipe, assert(size % data_size == 0); if (data_size == 12) { - /* TODO: Find a way to do this with the GPU! */ - nvc0_clear_buffer_cpu(pipe, res, offset, size, data, data_size); + nvc0_clear_buffer_push(pipe, res, offset, size, data, data_size); return; } + if (offset & 0xff) { + unsigned fixup_size = MIN2(size, align(offset, 0x100) - offset); + assert(fixup_size % data_size == 0); + nvc0_clear_buffer_push(pipe, res, offset, fixup_size, data, data_size); + offset += fixup_size; + size -= fixup_size; + if (!size) + return; + } + elements = size / data_size; height = (elements + 16383) / 16384; width = elements / height; + if (height > 1) + width &= ~0xff; + assert(width > 0); if (!PUSH_SPACE(push, 40)) return; @@ -465,7 +580,7 @@ nvc0_clear_buffer(struct pipe_context *pipe, BEGIN_NVC0(push, NVC0_3D(RT_ADDRESS_HIGH(0)), 9); PUSH_DATAh(push, buf->address + offset); PUSH_DATA (push, buf->address + offset); - PUSH_DATA (push, width * data_size); + PUSH_DATA (push, align(width * data_size, 0x100)); PUSH_DATA (push, height); PUSH_DATA (push, nvc0_format_table[dst_fmt].rt); PUSH_DATA (push, NVC0_3D_RT_TILE_MODE_LINEAR); @@ -480,24 +595,20 @@ nvc0_clear_buffer(struct pipe_context *pipe, IMMED_NVC0(push, NVC0_3D(CLEAR_BUFFERS), 0x3c); + IMMED_NVC0(push, NVC0_3D(COND_MODE), nvc0->cond_condmode); + + if (buf->mm) { + nouveau_fence_ref(nvc0->screen->base.fence.current, &buf->fence); + nouveau_fence_ref(nvc0->screen->base.fence.current, &buf->fence_wr); + } + if (width * height != elements) { offset += width * height * data_size; width = elements - width * height; - height = 1; - - BEGIN_NVC0(push, NVC0_3D(RT_ADDRESS_HIGH(0)), 4); - PUSH_DATAh(push, buf->address + offset); - PUSH_DATA (push, buf->address + offset); - PUSH_DATA (push, width * data_size); - PUSH_DATA (push, height); - - IMMED_NVC0(push, NVC0_3D(CLEAR_BUFFERS), 0x3c); + nvc0_clear_buffer_push(pipe, res, offset, width * data_size, + data, data_size); } - IMMED_NVC0(push, NVC0_3D(COND_MODE), nvc0->cond_condmode); - - nouveau_fence_ref(nvc0->screen->base.fence.current, &buf->fence); - nouveau_fence_ref(nvc0->screen->base.fence.current, &buf->fence_wr); nvc0->dirty |= NVC0_NEW_FRAMEBUFFER; } -- 2.7.4