drm/vc4: Add fragment shader threading support

author Jonas Pfeil <pfeiljonas@gmx.de>

Mon, 7 Nov 2016 23:18:39 +0000 (00:18 +0100)

committer Eric Anholt <eric@anholt.net>

Wed, 16 Nov 2016 21:25:26 +0000 (13:25 -0800)
author Jonas Pfeil <pfeiljonas@gmx.de>
Mon, 7 Nov 2016 23:18:39 +0000 (00:18 +0100)
committer Eric Anholt <eric@anholt.net>
Wed, 16 Nov 2016 21:25:26 +0000 (13:25 -0800)
diff --git a/drivers/gpu/drm/vc4/vc4_drv.c b/drivers/gpu/drm/vc4/vc4_drv.c

index 7abfe08..86aabf6 100644 (file)
--- a/drivers/gpu/drm/vc4/vc4_drv.c
+++ b/drivers/gpu/drm/vc4/vc4_drv.c
@@ -82,6 +82,7 @@ static int vc4_get_param_ioctl(struct drm_device *dev, void *data,
                 break;
         case DRM_VC4_PARAM_SUPPORTS_BRANCHES:
         case DRM_VC4_PARAM_SUPPORTS_ETC1:
+       case DRM_VC4_PARAM_SUPPORTS_THREADED_FS:
                 args->value = true;
                 break;
         default:
diff --git a/drivers/gpu/drm/vc4/vc4_drv.h b/drivers/gpu/drm/vc4/vc4_drv.h

index 7c1e4d9..fef1728 100644 (file)
--- a/drivers/gpu/drm/vc4/vc4_drv.h
+++ b/drivers/gpu/drm/vc4/vc4_drv.h
@@ -381,6 +381,8 @@ struct vc4_validated_shader_info {
  
         uint32_t num_uniform_addr_offsets;
         uint32_t *uniform_addr_offsets;
+
+       bool is_threaded;
  };
  
  /**
diff --git a/drivers/gpu/drm/vc4/vc4_validate.c b/drivers/gpu/drm/vc4/vc4_validate.c

index e18f882..9fd171c 100644 (file)
--- a/drivers/gpu/drm/vc4/vc4_validate.c
+++ b/drivers/gpu/drm/vc4/vc4_validate.c
@@ -789,11 +789,6 @@ validate_gl_shader_rec(struct drm_device *dev,
         exec->shader_rec_v += roundup(packet_size, 16);
         exec->shader_rec_size -= packet_size;
  
-       if (!(*(uint16_t *)pkt_u & VC4_SHADER_FLAG_FS_SINGLE_THREAD)) {
-               DRM_ERROR("Multi-threaded fragment shaders not supported.\n");
-               return -EINVAL;
-       }
-
         for (i = 0; i < shader_reloc_count; i++) {
                 if (src_handles[i] > exec->bo_count) {
                         DRM_ERROR("Shader handle %d too big\n", src_handles[i]);
@@ -810,6 +805,18 @@ validate_gl_shader_rec(struct drm_device *dev,
                         return -EINVAL;
         }
  
+       if (((*(uint16_t *)pkt_u & VC4_SHADER_FLAG_FS_SINGLE_THREAD) == 0) !=
+           to_vc4_bo(&bo[0]->base)->validated_shader->is_threaded) {
+               DRM_ERROR("Thread mode of CL and FS do not match\n");
+               return -EINVAL;
+       }
+
+       if (to_vc4_bo(&bo[1]->base)->validated_shader->is_threaded ||
+           to_vc4_bo(&bo[2]->base)->validated_shader->is_threaded) {
+               DRM_ERROR("cs and vs cannot be threaded\n");
+               return -EINVAL;
+       }
+
         for (i = 0; i < shader_reloc_count; i++) {
                 struct vc4_validated_shader_info *validated_shader;
                 uint32_t o = shader_reloc_offsets[i];
diff --git a/drivers/gpu/drm/vc4/vc4_validate_shaders.c b/drivers/gpu/drm/vc4/vc4_validate_shaders.c

index 917321c..5dba13d 100644 (file)
--- a/drivers/gpu/drm/vc4/vc4_validate_shaders.c
+++ b/drivers/gpu/drm/vc4/vc4_validate_shaders.c
@@ -83,6 +83,13 @@ struct vc4_shader_validation_state {
          * basic blocks.
          */
         bool needs_uniform_address_for_loop;
+
+       /* Set when we find an instruction reading or writing the top
+        * half of the register files.  A threaded shader must not use
+        * those registers: if it did, the clamp validation of the other
+        * shader running on our QPU would be invalid.
+        */
+       bool all_registers_used;
  };
  
  static uint32_t
@@ -119,6 +126,13 @@ raddr_add_a_to_live_reg_index(uint64_t inst)
  }
  
  static bool
+live_reg_is_upper_half(uint32_t lri)
+{
+       return  (lri >= 16 && lri < 32) ||
+               (lri >= 32 + 16 && lri < 32 + 32);
+}
+
+static bool
  is_tmu_submit(uint32_t waddr)
  {
         return (waddr == QPU_W_TMU0_S ||
@@ -390,6 +404,9 @@ check_reg_write(struct vc4_validated_shader_info *validated_shader,
                 } else {
                         validation_state->live_immediates[lri] = ~0;
                 }
+
+               if (live_reg_is_upper_half(lri))
+                       validation_state->all_registers_used = true;
         }
  
         switch (waddr) {
@@ -598,6 +615,11 @@ check_instruction_reads(struct vc4_validated_shader_info *validated_shader,
                 }
         }
  
+       if ((raddr_a >= 16 && raddr_a < 32) ||
+           (raddr_b >= 16 && raddr_b < 32 && sig != QPU_SIG_SMALL_IMM)) {
+               validation_state->all_registers_used = true;
+       }
+
         return true;
  }
  
@@ -753,6 +775,7 @@ vc4_validate_shader(struct drm_gem_cma_object *shader_obj)
  {
         bool found_shader_end = false;
         int shader_end_ip = 0;
+       uint32_t last_thread_switch_ip = -3;
         uint32_t ip;
         struct vc4_validated_shader_info *validated_shader = NULL;
         struct vc4_shader_validation_state validation_state;
@@ -785,6 +808,17 @@ vc4_validate_shader(struct drm_gem_cma_object *shader_obj)
                 if (!vc4_handle_branch_target(&validation_state))
                         goto fail;
  
+               if (ip == last_thread_switch_ip + 3) {
+                       /* Reset r0-r3 live clamp data */
+                       int i;
+
+                       for (i = 64; i < LIVE_REG_COUNT; i++) {
+                               validation_state.live_min_clamp_offsets[i] = ~0;
+                               validation_state.live_max_clamp_regs[i] = false;
+                               validation_state.live_immediates[i] = ~0;
+                       }
+               }
+
                 switch (sig) {
                 case QPU_SIG_NONE:
                 case QPU_SIG_WAIT_FOR_SCOREBOARD:
@@ -794,6 +828,8 @@ vc4_validate_shader(struct drm_gem_cma_object *shader_obj)
                 case QPU_SIG_LOAD_TMU1:
                 case QPU_SIG_PROG_END:
                 case QPU_SIG_SMALL_IMM:
+               case QPU_SIG_THREAD_SWITCH:
+               case QPU_SIG_LAST_THREAD_SWITCH:
                         if (!check_instruction_writes(validated_shader,
                                                       &validation_state)) {
                                 DRM_ERROR("Bad write at ip %d\n", ip);
@@ -809,6 +845,18 @@ vc4_validate_shader(struct drm_gem_cma_object *shader_obj)
                                 shader_end_ip = ip;
                         }
  
+                       if (sig == QPU_SIG_THREAD_SWITCH ||
+                           sig == QPU_SIG_LAST_THREAD_SWITCH) {
+                               validated_shader->is_threaded = true;
+
+                               if (ip < last_thread_switch_ip + 3) {
+                                       DRM_ERROR("Thread switch too soon after "
+                                                 "last switch at ip %d\n", ip);
+                                       goto fail;
+                               }
+                               last_thread_switch_ip = ip;
+                       }
+
                         break;
  
                 case QPU_SIG_LOAD_IMM:
@@ -823,6 +871,13 @@ vc4_validate_shader(struct drm_gem_cma_object *shader_obj)
                         if (!check_branch(inst, validated_shader,
                                           &validation_state, ip))
                                 goto fail;
+
+                       if (ip < last_thread_switch_ip + 3) {
+                               DRM_ERROR("Branch in thread switch at ip %d",
+                                         ip);
+                               goto fail;
+                       }
+
                         break;
                 default:
                         DRM_ERROR("Unsupported QPU signal %d at "
@@ -844,6 +899,14 @@ vc4_validate_shader(struct drm_gem_cma_object *shader_obj)
                 goto fail;
         }
  
+       /* Might corrupt other thread */
+       if (validated_shader->is_threaded &&
+           validation_state.all_registers_used) {
+               DRM_ERROR("Shader uses threading, but uses the upper "
+                         "half of the registers, too\n");
+               goto fail;
+       }
+
         /* If we did a backwards branch and we haven't emitted a uniforms
          * reset since then, we still need the uniforms stream to have the
          * uniforms address available so that the backwards branch can do its
diff --git a/include/uapi/drm/vc4_drm.h b/include/uapi/drm/vc4_drm.h

index 69caa21..f07a090 100644 (file)
--- a/include/uapi/drm/vc4_drm.h
+++ b/include/uapi/drm/vc4_drm.h
@@ -287,6 +287,7 @@ struct drm_vc4_get_hang_state {
  #define DRM_VC4_PARAM_V3D_IDENT2               2
  #define DRM_VC4_PARAM_SUPPORTS_BRANCHES                3
  #define DRM_VC4_PARAM_SUPPORTS_ETC1            4
+#define DRM_VC4_PARAM_SUPPORTS_THREADED_FS     5
  
  struct drm_vc4_get_param {
         __u32 param;
author	Jonas Pfeil <pfeiljonas@gmx.de>
	Mon, 7 Nov 2016 23:18:39 +0000 (00:18 +0100)
committer	Eric Anholt <eric@anholt.net>
	Wed, 16 Nov 2016 21:25:26 +0000 (13:25 -0800)
drivers/gpu/drm/vc4/vc4_drv.c		patch \| blob \| history
drivers/gpu/drm/vc4/vc4_drv.h		patch \| blob \| history
drivers/gpu/drm/vc4/vc4_validate.c		patch \| blob \| history
drivers/gpu/drm/vc4/vc4_validate_shaders.c		patch \| blob \| history
include/uapi/drm/vc4_drm.h		patch \| blob \| history