From c5c12bdd004994abfe0f5723e2d285cc69706b1a Mon Sep 17 00:00:00 2001 From: Kenneth Graunke Date: Tue, 7 May 2019 23:19:30 -0700 Subject: [PATCH] iris: Try to recover from GPU hangs. The iris batch module now tries to detect that the kernel has banned our GEM context, creates a new non-banned context, and informs the iris context module that all assumptions about state are now invalid and it needs to reinitialize the relevant state. Based on Chris Wilson's work, but significantly rewritten by me. --- src/gallium/drivers/iris/iris_batch.c | 31 +++++++++++++++++++++++++++ src/gallium/drivers/iris/iris_context.c | 38 +++++++++++++++++++++++++++++++++ src/gallium/drivers/iris/iris_context.h | 2 ++ 3 files changed, 71 insertions(+) diff --git a/src/gallium/drivers/iris/iris_batch.c b/src/gallium/drivers/iris/iris_batch.c index d2b4fc8..f3d2e56 100644 --- a/src/gallium/drivers/iris/iris_batch.c +++ b/src/gallium/drivers/iris/iris_batch.c @@ -452,6 +452,28 @@ iris_finish_batch(struct iris_batch *batch) } /** + * Replace our current GEM context with a new one (in case it got banned). + */ +static bool +replace_hw_ctx(struct iris_batch *batch) +{ + struct iris_screen *screen = batch->screen; + struct iris_bufmgr *bufmgr = screen->bufmgr; + + uint32_t new_ctx = iris_clone_hw_context(bufmgr, batch->hw_ctx_id); + if (!new_ctx) + return false; /* no replacement context; keep the old one */ + + iris_destroy_hw_context(bufmgr, batch->hw_ctx_id); + batch->hw_ctx_id = new_ctx; + + /* Notify the context that state must be re-initialized. */ + iris_lost_context_state(batch); + + return true; +} + +/** * Submit the batch to the GPU via execbuffer2. */ static int @@ -583,6 +605,15 @@ _iris_batch_flush(struct iris_batch *batch, const char *file, int line) /* Start a new batch buffer. */ iris_batch_reset(batch); + /* EIO means our context is banned. In this case, try to replace it + * with a new logical context, and inform iris_context that all state + * has been lost and needs to be re-initialized. 
If this succeeds, + * dubiously claim success... + */ + if (ret == -EIO && replace_hw_ctx(batch)) { + ret = 0; + } + if (ret >= 0) { //if (iris->ctx.Const.ResetStrategy == GL_LOSE_CONTEXT_ON_RESET_ARB) //iris_check_for_reset(ice); diff --git a/src/gallium/drivers/iris/iris_context.c b/src/gallium/drivers/iris/iris_context.c index a1d1175..7ed4fdc 100644 --- a/src/gallium/drivers/iris/iris_context.c +++ b/src/gallium/drivers/iris/iris_context.c @@ -63,6 +63,44 @@ iris_set_debug_callback(struct pipe_context *ctx, memset(&ice->dbg, 0, sizeof(ice->dbg)); } +/** + * Called from the batch module when it detects a GPU hang. + * + * In this case, we've lost our GEM context, and can't rely on any existing + * state on the GPU. We must mark everything dirty and wipe away any saved + * assumptions about the last known state of the GPU. + */ +void +iris_lost_context_state(struct iris_batch *batch) +{ + /* The batch module doesn't have an iris_context, because we want to + * avoid introducing lots of layering violations. Unfortunately, here + * we do need to inform the context of batch catastrophe. We know the + * batch is one of our context's batches, so hackily claw our way back. 
+ */ + struct iris_context *ice = NULL; + struct iris_screen *screen; + + if (batch->name == IRIS_BATCH_RENDER) { + ice = container_of(batch, ice, batches[IRIS_BATCH_RENDER]); + assert(&ice->batches[IRIS_BATCH_RENDER] == batch); + screen = (void *) ice->ctx.screen; + + ice->vtbl.init_render_context(screen, batch, &ice->vtbl, &ice->dbg); + } else if (batch->name == IRIS_BATCH_COMPUTE) { + ice = container_of(batch, ice, batches[IRIS_BATCH_COMPUTE]); + assert(&ice->batches[IRIS_BATCH_COMPUTE] == batch); + screen = (void *) ice->ctx.screen; + + ice->vtbl.init_compute_context(screen, batch, &ice->vtbl, &ice->dbg); + } else { + unreachable("unhandled batch reset"); + } + + ice->state.dirty = ~0ull; + memset(ice->state.last_grid, 0, sizeof(ice->state.last_grid)); +} + static void iris_get_sample_position(struct pipe_context *ctx, unsigned sample_count, diff --git a/src/gallium/drivers/iris/iris_context.h b/src/gallium/drivers/iris/iris_context.h index 31f345d..4501c4f 100644 --- a/src/gallium/drivers/iris/iris_context.h +++ b/src/gallium/drivers/iris/iris_context.h @@ -662,6 +662,8 @@ double get_time(void); struct pipe_context * iris_create_context(struct pipe_screen *screen, void *priv, unsigned flags); +void iris_lost_context_state(struct iris_batch *batch); + void iris_init_blit_functions(struct pipe_context *ctx); void iris_init_clear_functions(struct pipe_context *ctx); void iris_init_program_functions(struct pipe_context *ctx); -- 2.7.4