From b7979a849bc185fbcab93a841eed692a10d61e25 Mon Sep 17 00:00:00 2001
From: Jason Ekstrand <jason.ekstrand@intel.com>
Date: Mon, 24 Oct 2016 14:57:21 -0700
Subject: [PATCH] i965/blit: Break blits into chunks in set_alpha_to_one

v2: Properly handle linear blit alignment restrictions

Signed-off-by: Jason Ekstrand <jason@jlekstrand.net>
Reviewed-by: Topi Pohjolainen <topi.pohjolainen@intel.com>
Reviewed-by: Anuj Phogat <anuj.phogat@gmail.com>
---
 src/mesa/drivers/dri/i965/intel_blit.c | 88 ++++++++++++++++++++++++++++------
 1 file changed, 73 insertions(+), 15 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/intel_blit.c b/src/mesa/drivers/dri/i965/intel_blit.c
index bc97e66..05a78d7 100644
--- a/src/mesa/drivers/dri/i965/intel_blit.c
+++ b/src/mesa/drivers/dri/i965/intel_blit.c
@@ -179,6 +179,42 @@ intel_miptree_blit_compatible_formats(mesa_format src, mesa_format dst)
    return false;
 }
 
+static void
+get_blit_intratile_offset_el(const struct brw_context *brw,
+                             struct intel_mipmap_tree *mt,
+                             uint32_t total_x_offset_el,
+                             uint32_t total_y_offset_el,
+                             uint32_t *base_address_offset, /* out */
+                             uint32_t *x_offset_el,         /* out */
+                             uint32_t *y_offset_el)         /* out */
+{
+   enum isl_tiling tiling = intel_miptree_get_isl_tiling(mt);
+   isl_tiling_get_intratile_offset_el(&brw->isl_dev, /* fills the outputs */
+                                      tiling, mt->cpp, mt->pitch,
+                                      total_x_offset_el, total_y_offset_el,
+                                      base_address_offset,
+                                      x_offset_el, y_offset_el);
+   if (tiling == ISL_TILING_LINEAR) {
+      /* From the Broadwell PRM docs for XY_SRC_COPY_BLT::SourceBaseAddress:
+       *
+       *    "Base address of the destination surface: X=0, Y=0. Lower 32bits
+       *    of the 48bit addressing. When Src Tiling is enabled (Bit_15
+       *    enabled), this address must be 4KB-aligned. When Tiling is not
+       *    enabled, this address should be CL (64byte) aligned."
+       *
+       * Tiled: the base offset from ISL is asserted to be 4KB-aligned.
+       * Linear: round the base down to 64B and add the rest to x_offset_el.
+       */
+      assert(mt->pitch % 64 == 0);                /* pitch is a CL multiple */
+      uint32_t delta = *base_address_offset & 63; /* bytes past a 64B line */
+      assert(delta % mt->cpp == 0);               /* whole elements only */
+      *base_address_offset -= delta;              /* round down to 64B */
+      *x_offset_el += delta / mt->cpp;            /* compensate in x */
+   } else {
+      assert(*base_address_offset % 4096 == 0);   /* must be 4KB-aligned */
+   }
+}
+
 /**
  * Implements a rectangular block transfer (blit) of pixels between two
  * miptrees.
@@ -804,22 +840,44 @@ intel_miptree_set_alpha_to_one(struct brw_context *brw,
    unsigned length = brw->gen >= 8 ? 7 : 6;
    bool dst_y_tiled = mt->tiling == I915_TILING_Y;
 
-   BEGIN_BATCH_BLT_TILED(length, dst_y_tiled, false);
-   OUT_BATCH(CMD | (length - 2));
-   OUT_BATCH(BR13);
-   OUT_BATCH(SET_FIELD(y, BLT_Y) | SET_FIELD(x, BLT_X));
-   OUT_BATCH(SET_FIELD(y + height, BLT_Y) | SET_FIELD(x + width, BLT_X));
-   if (brw->gen >= 8) {
-      OUT_RELOC64(mt->bo,
-                  I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
-                  0);
-   } else {
-      OUT_RELOC(mt->bo,
-                I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
-                0);
+   /* We need to split the blit into chunks that each fit within the blitter's
+    * restrictions.  We can't use a chunk size of 32768 because we need to
+    * ensure that src_tile_x + chunk_size fits.  We choose 16384 because it's
+    * a nice round power of two, big enough that performance won't suffer, and
+    * small enough to guarantee everything fits.
+    */
+   const uint32_t max_chunk_size = 16384;
+
+   for (uint32_t chunk_x = 0; chunk_x < width; chunk_x += max_chunk_size) {
+      for (uint32_t chunk_y = 0; chunk_y < height; chunk_y += max_chunk_size) {
+         const uint32_t chunk_w = MIN2(max_chunk_size, width - chunk_x);
+         const uint32_t chunk_h = MIN2(max_chunk_size, height - chunk_y);
+
+         uint32_t offset, tile_x, tile_y;
+         get_blit_intratile_offset_el(brw, mt,
+                                      x + chunk_x, y + chunk_y,
+                                      &offset, &tile_x, &tile_y);
+
+         BEGIN_BATCH_BLT_TILED(length, dst_y_tiled, false);
+         OUT_BATCH(CMD | (length - 2));
+         OUT_BATCH(BR13);
+         OUT_BATCH(SET_FIELD(y + chunk_y, BLT_Y) |
+                   SET_FIELD(x + chunk_x, BLT_X));
+         OUT_BATCH(SET_FIELD(y + chunk_y + chunk_h, BLT_Y) |
+                   SET_FIELD(x + chunk_x + chunk_w, BLT_X));
+         if (brw->gen >= 8) {
+            OUT_RELOC64(mt->bo,
+                        I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
+                        offset);
+         } else {
+            OUT_RELOC(mt->bo,
+                      I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
+                      offset);
+         }
+         OUT_BATCH(0xffffffff); /* white, but only alpha gets written */
+         ADVANCE_BATCH_TILED(dst_y_tiled, false);
+      }
    }
-   OUT_BATCH(0xffffffff); /* white, but only alpha gets written */
-   ADVANCE_BATCH_TILED(dst_y_tiled, false);
 
    brw_emit_mi_flush(brw);
 }
-- 
2.7.4