intel/vec4: Add support for masking pushed data
author Jason Ekstrand <jason@jlekstrand.net>
Sun, 2 May 2021 18:54:01 +0000 (13:54 -0500)
committer Marge Bot <eric+marge@anholt.net>
Wed, 19 May 2021 14:38:13 +0000 (14:38 +0000)
This is the vec4 equivalent of d0d039a4d3f4, required for proper UBO
pushing in vertex stages for Vulkan on HSW.  Sadly, the implementation
requires us to do everything in ALIGN1 mode and the vec4 instruction
scheduler doesn't understand HW_GRF <-> UNIFORM interference so it's
easier to do the whole thing in the generator.  We add an instruction
to the top of the program which just means "emit the blob" and all the
magic happens in codegen.

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/10571>

src/intel/compiler/brw_eu_defines.h
src/intel/compiler/brw_ir_performance.cpp
src/intel/compiler/brw_shader.cpp
src/intel/compiler/brw_vec4.cpp
src/intel/compiler/brw_vec4_generator.cpp

index 3fccd18..974efce 100644 (file)
@@ -574,6 +574,7 @@ enum opcode {
    VEC4_OPCODE_SET_LOW_32BIT,
    VEC4_OPCODE_SET_HIGH_32BIT,
    VEC4_OPCODE_MOV_FOR_SCRATCH,
+   VEC4_OPCODE_ZERO_OOB_PUSH_REGS,
 
    FS_OPCODE_DDX_COARSE,
    FS_OPCODE_DDX_FINE,
index fa0aca6..a9f9360 100644 (file)
@@ -344,6 +344,7 @@ namespace {
       case VEC4_OPCODE_PICK_HIGH_32BIT:
       case VEC4_OPCODE_SET_LOW_32BIT:
       case VEC4_OPCODE_SET_HIGH_32BIT:
+      case VEC4_OPCODE_ZERO_OOB_PUSH_REGS:
       case GS_OPCODE_SET_DWORD_2:
       case GS_OPCODE_SET_WRITE_OFFSET:
       case GS_OPCODE_SET_VERTEX_COUNT:
index bc50595..4aeff34 100644 (file)
@@ -423,6 +423,8 @@ brw_instruction_name(const struct intel_device_info *devinfo, enum opcode op)
       return "set_high_32bit";
    case VEC4_OPCODE_MOV_FOR_SCRATCH:
       return "mov_for_scratch";
+   case VEC4_OPCODE_ZERO_OOB_PUSH_REGS:
+      return "zero_oob_push_regs";
 
    case FS_OPCODE_DDX_COARSE:
       return "ddx_coarse";
@@ -1136,6 +1138,7 @@ backend_instruction::has_side_effects() const
    case SHADER_OPCODE_BTD_SPAWN_LOGICAL:
    case SHADER_OPCODE_BTD_RETIRE_LOGICAL:
    case RT_OPCODE_TRACE_RAY_LOGICAL:
+   case VEC4_OPCODE_ZERO_OOB_PUSH_REGS:
       return true;
    default:
       return eot;
index 062167e..4e215c8 100644 (file)
@@ -2698,6 +2698,20 @@ vec4_visitor::run()
 
    setup_push_ranges();
 
+   if (prog_data->base.zero_push_reg) {
+      /* push_reg_mask_param is in uint32 params and UNIFORM is in vec4s */
+      const unsigned mask_param = stage_prog_data->push_reg_mask_param;
+      src_reg mask = src_reg(dst_reg(UNIFORM, mask_param / 4));
+      assert(mask_param % 2 == 0); /* Should be 64-bit-aligned */
+      mask.swizzle = BRW_SWIZZLE4((mask_param + 0) % 4,
+                                  (mask_param + 1) % 4,
+                                  (mask_param + 0) % 4,
+                                  (mask_param + 1) % 4);
+
+      emit(VEC4_OPCODE_ZERO_OOB_PUSH_REGS,
+           dst_reg(VGRF, alloc.allocate(3)), mask);
+   }
+
    emit_prolog();
 
    emit_nir_code();
index 1039bf2..1379c99 100644 (file)
@@ -1465,6 +1465,57 @@ generate_mov_indirect(struct brw_codegen *p,
 }
 
 static void
+generate_zero_oob_push_regs(struct brw_codegen *p,
+                            struct brw_stage_prog_data *prog_data,
+                            struct brw_reg scratch,
+                            struct brw_reg bit_mask_in)
+{ /* Zero every push-constant GRF whose bit is set in zero_push_reg (OOB pushed UBO ranges).  Emitted at the top of the program; everything runs in ALIGN1 with write-enable-all. */
+   const uint64_t want_zero = prog_data->zero_push_reg; /* bit i set => zero push GRF i */
+   assert(want_zero);
+
+   assert(bit_mask_in.file == BRW_GENERAL_REGISTER_FILE);
+   assert(BRW_GET_SWZ(bit_mask_in.swizzle, 1) ==
+          BRW_GET_SWZ(bit_mask_in.swizzle, 0) + 1); /* mask must be two consecutive dwords (a 64-bit value) */
+   bit_mask_in.subnr += BRW_GET_SWZ(bit_mask_in.swizzle, 0) * 4; /* fold the vec4 swizzle into a byte offset for ALIGN1 addressing */
+   bit_mask_in.type = BRW_REGISTER_TYPE_W; /* re-read the 64-bit mask as four 16-bit chunks below */
+
+   /* Scratch should be 3 registers in the GRF */
+   assert(scratch.file == BRW_GENERAL_REGISTER_FILE);
+   scratch = vec8(scratch);
+   struct brw_reg mask_w16 = retype(scratch, BRW_REGISTER_TYPE_W); /* reg 0: 16 word temporaries */
+   struct brw_reg mask_d16 = retype(byte_offset(scratch, REG_SIZE),
+                                    BRW_REGISTER_TYPE_D); /* regs 1-2: 16 dword lane masks, each all-0s or all-1s */
+
+   brw_push_insn_state(p);
+   brw_set_default_access_mode(p, BRW_ALIGN_1);
+   brw_set_default_mask_control(p, BRW_MASK_DISABLE); /* WE_all: must not depend on the dispatch mask */
+
+   for (unsigned i = 0; i < 64; i++) {
+      if (i % 16 == 0 && (want_zero & BITFIELD64_RANGE(i, 16))) { /* (re)build mask_d16 for this 16-bit chunk of want-zero bits */
+         brw_set_default_exec_size(p, BRW_EXECUTE_8);
+         brw_SHL(p, suboffset(mask_w16, 8),
+                    vec1(byte_offset(bit_mask_in, i / 8)),
+                    brw_imm_v(0x01234567)); /* per-channel shifts 7..0: word 8+n gets chunk bit 8+n in its sign bit */
+         brw_SHL(p, mask_w16, suboffset(mask_w16, 8), brw_imm_w(8)); /* shift by 8 more: word n gets chunk bit n in its sign bit */
+
+         brw_set_default_exec_size(p, BRW_EXECUTE_16);
+         brw_ASR(p, mask_d16, mask_w16, brw_imm_w(15)); /* broadcast each sign bit: word k -> 0x0 or ~0 dword k */
+      }
+
+      if (want_zero & BITFIELD64_BIT(i)) {
+         unsigned push_start = prog_data->dispatch_grf_start_reg;
+         struct brw_reg push_reg =
+            retype(brw_vec8_grf(push_start + i, 0), BRW_REGISTER_TYPE_D);
+
+         brw_set_default_exec_size(p, BRW_EXECUTE_8);
+         brw_AND(p, push_reg, push_reg, vec1(suboffset(mask_d16, i))); /* AND with ~0 keeps the reg, with 0 zeroes it */
+      }
+   }
+
+   brw_pop_insn_state(p);
+}
+
+static void
 generate_code(struct brw_codegen *p,
               const struct brw_compiler *compiler,
               void *log_data,
@@ -2065,6 +2116,10 @@ generate_code(struct brw_codegen *p,
          break;
       }
 
+      case VEC4_OPCODE_ZERO_OOB_PUSH_REGS:
+         /* All the work happens in the generator helper; emit and stop.
+          * The missing break here would fall through into the TCS URB
+          * write case and emit a bogus URB write for every such shader.
+          */
+         generate_zero_oob_push_regs(p, &prog_data->base, dst, src[0]);
+         break;
+
       case TCS_OPCODE_URB_WRITE:
          generate_tcs_urb_write(p, inst, src[0]);
          send_count++;