From 7d33f09a780c696a1ce30d70fce3721a80d5094a Mon Sep 17 00:00:00 2001 From: Yang Rong Date: Mon, 29 Sep 2014 13:38:12 +0800 Subject: [PATCH] BDW: add some BDW functions. Add intel_gpgpu_load_vfe_state_gen8, intel_gpgpu_walker_gen8, intel_gpgpu_build_idrt_gen8. Relocate the Dynamic State Base Address in gen7's intel_gpgpu_set_base_address, to unify intel_gpgpu_load_curbe_buffer and intel_gpgpu_load_idrt. Part of the builtin_global_id utest now passes. Signed-off-by: Yang Rong Reviewed-by: Zhigang Gong Reviewed-by: Junyan He --- src/intel/intel_gpgpu.c | 176 ++++++++++++++++++++++++++++++++++++++++------ src/intel/intel_structs.h | 104 +++++++++++++++++++-------- 2 files changed, 227 insertions(+), 53 deletions(-) diff --git a/src/intel/intel_gpgpu.c b/src/intel/intel_gpgpu.c index 6d3c443..b4d0846 100644 --- a/src/intel/intel_gpgpu.c +++ b/src/intel/intel_gpgpu.c @@ -143,6 +143,13 @@ typedef void (intel_gpgpu_setup_bti_t)(intel_gpgpu_t *gpgpu, drm_intel_bo *buf, uint32_t size, unsigned char index, uint32_t format); intel_gpgpu_setup_bti_t *intel_gpgpu_setup_bti = NULL; + +typedef void (intel_gpgpu_load_vfe_state_t)(intel_gpgpu_t *gpgpu); +intel_gpgpu_load_vfe_state_t *intel_gpgpu_load_vfe_state = NULL; + +typedef void (intel_gpgpu_build_idrt_t)(intel_gpgpu_t *gpgpu, cl_gpgpu_kernel *kernel); +intel_gpgpu_build_idrt_t *intel_gpgpu_build_idrt = NULL; + static void intel_gpgpu_sync(void *buf) { @@ -213,7 +220,7 @@ static void intel_gpgpu_select_pipeline(intel_gpgpu_t *gpgpu) { BEGIN_BATCH(gpgpu->batch, 1); - OUT_BATCH(gpgpu->batch, CMD_PIPELINE_SELECT | PIPELINE_SELECT_MEDIA); + OUT_BATCH(gpgpu->batch, CMD_PIPELINE_SELECT | PIPELINE_SELECT_GPGPU); ADVANCE_BATCH(gpgpu->batch); } @@ -252,7 +259,12 @@ intel_gpgpu_set_base_address_gen7(intel_gpgpu_t *gpgpu) I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION, gpgpu->aux_offset.surface_heap_offset + (0 | (def_cc << 8) | (def_cc << 4) | (0 << 3)| BASE_ADDRESS_MODIFY)); - OUT_BATCH(gpgpu->batch, 0 | (def_cc << 8) | 
BASE_ADDRESS_MODIFY); /* Dynamic State Base Addr */ + + OUT_RELOC(gpgpu->batch, gpgpu->aux_buf.bo, + I915_GEM_DOMAIN_RENDER, + I915_GEM_DOMAIN_RENDER, + 0 | (def_cc << 8) | BASE_ADDRESS_MODIFY); /* Dynamic State Base Addr */ + OUT_BATCH(gpgpu->batch, 0 | (def_cc << 8) | BASE_ADDRESS_MODIFY); /* Indirect Obj Base Addr */ OUT_BATCH(gpgpu->batch, 0 | (def_cc << 8) | BASE_ADDRESS_MODIFY); /* Instruction Base Addr */ /* If we output an AUB file, we limit the total size to 64MB */ @@ -364,7 +376,7 @@ intel_gpgpu_get_curbe_size(intel_gpgpu_t *gpgpu) } static void -intel_gpgpu_load_vfe_state(intel_gpgpu_t *gpgpu) +intel_gpgpu_load_vfe_state_gen7(intel_gpgpu_t *gpgpu) { int32_t scratch_index; BEGIN_BATCH(gpgpu->batch, 8); @@ -392,13 +404,43 @@ intel_gpgpu_load_vfe_state(intel_gpgpu_t *gpgpu) } static void +intel_gpgpu_load_vfe_state_gen8(intel_gpgpu_t *gpgpu) +{ + int32_t scratch_index; + BEGIN_BATCH(gpgpu->batch, 9); + OUT_BATCH(gpgpu->batch, CMD_MEDIA_STATE_POINTERS | (9-2)); + + if(gpgpu->per_thread_scratch > 0) { + scratch_index = intel_gpgpu_get_scratch_index(gpgpu->per_thread_scratch); + OUT_RELOC(gpgpu->batch, gpgpu->scratch_b.bo, + I915_GEM_DOMAIN_RENDER, + I915_GEM_DOMAIN_RENDER, + scratch_index); + } + else { + OUT_BATCH(gpgpu->batch, 0); + } + OUT_BATCH(gpgpu->batch, 0); + + /* max_thread | urb entries | (reset_gateway|bypass_gate_way | gpgpu_mode) */ + OUT_BATCH(gpgpu->batch, 0 | ((gpgpu->max_threads - 1) << 16) | (2 << 8) | 0xc0); //urb entries can't be 0 + OUT_BATCH(gpgpu->batch, 0); + /* urb entries size | curbe_size */ + OUT_BATCH(gpgpu->batch, 2<<16 | intel_gpgpu_get_curbe_size(gpgpu)); + OUT_BATCH(gpgpu->batch, 0); + OUT_BATCH(gpgpu->batch, 0); + OUT_BATCH(gpgpu->batch, 0); + ADVANCE_BATCH(gpgpu->batch); +} + +static void intel_gpgpu_load_curbe_buffer(intel_gpgpu_t *gpgpu) { BEGIN_BATCH(gpgpu->batch, 4); OUT_BATCH(gpgpu->batch, CMD(2,0,1) | (4 - 2)); /* length-2 */ OUT_BATCH(gpgpu->batch, 0); /* mbz */ OUT_BATCH(gpgpu->batch, 
intel_gpgpu_get_curbe_size(gpgpu) * 32); - OUT_RELOC(gpgpu->batch, gpgpu->aux_buf.bo, I915_GEM_DOMAIN_INSTRUCTION, 0, gpgpu->aux_offset.curbe_offset); + OUT_BATCH(gpgpu->batch, gpgpu->aux_offset.curbe_offset); ADVANCE_BATCH(gpgpu->batch); } @@ -409,7 +451,7 @@ intel_gpgpu_load_idrt(intel_gpgpu_t *gpgpu) OUT_BATCH(gpgpu->batch, CMD(2,0,2) | (4 - 2)); /* length-2 */ OUT_BATCH(gpgpu->batch, 0); /* mbz */ OUT_BATCH(gpgpu->batch, 1 << 5); - OUT_RELOC(gpgpu->batch, gpgpu->aux_buf.bo, I915_GEM_DOMAIN_INSTRUCTION, 0, gpgpu->aux_offset.idrt_offset); + OUT_BATCH(gpgpu->batch, gpgpu->aux_offset.idrt_offset); ADVANCE_BATCH(gpgpu->batch); } @@ -810,15 +852,15 @@ intel_gpgpu_setup_bti_gen8(intel_gpgpu_t *gpgpu, drm_intel_bo *buf, uint32_t int ss0->ss3.depth = (s >> 21) & 0x3ff; /* bits 30:21 of sz */ ss0->ss1.mem_obj_ctrl_state = cl_gpgpu_get_cache_ctrl(); heap->binding_table[index] = offsetof(surface_heap_t, surface) + index * sizeof(gen8_surface_state_t); - ss0->ss8_9.surface_base_addr_lo = (buf->offset64 + internal_offset) & 0xffffffff; - ss0->ss8_9.surface_base_addr_hi = ((buf->offset64 + internal_offset) >> 32) & 0xffffffff; + ss0->ss8.surface_base_addr_lo = (buf->offset64 + internal_offset) & 0xffffffff; + ss0->ss9.surface_base_addr_hi = ((buf->offset64 + internal_offset) >> 32) & 0xffffffff; dri_bo_emit_reloc(gpgpu->aux_buf.bo, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, internal_offset, gpgpu->aux_offset.surface_heap_offset + heap->binding_table[index] + - offsetof(gen8_surface_state_t, ss1), + offsetof(gen8_surface_state_t, ss8), buf); } @@ -1010,7 +1052,7 @@ intel_gpgpu_set_stack(intel_gpgpu_t *gpgpu, uint32_t offset, uint32_t size, uint } static void -intel_gpgpu_build_idrt(intel_gpgpu_t *gpgpu, cl_gpgpu_kernel *kernel) +intel_gpgpu_build_idrt_gen7(intel_gpgpu_t *gpgpu, cl_gpgpu_kernel *kernel) { gen6_interface_descriptor_t *desc; drm_intel_bo *ker_bo = NULL; @@ -1025,7 +1067,7 @@ intel_gpgpu_build_idrt(intel_gpgpu_t *gpgpu, cl_gpgpu_kernel *kernel) 
desc->desc5.rounding_mode = 0; /* round to nearest even */ assert((gpgpu->aux_buf.bo->offset + gpgpu->aux_offset.sampler_state_offset) % 32 == 0); - desc->desc2.sampler_state_pointer = (gpgpu->aux_buf.bo->offset + gpgpu->aux_offset.sampler_state_offset) >> 5; + desc->desc2.sampler_state_pointer = gpgpu->aux_offset.sampler_state_offset >> 5; desc->desc3.binding_table_entry_count = 0; /* no prefetch */ desc->desc3.binding_table_pointer = 0; desc->desc4.curbe_read_len = kernel->curbe_sz / 32; @@ -1057,12 +1099,48 @@ intel_gpgpu_build_idrt(intel_gpgpu_t *gpgpu, cl_gpgpu_kernel *kernel) 0, gpgpu->aux_offset.idrt_offset + offsetof(gen6_interface_descriptor_t, desc0), ker_bo); +} - dri_bo_emit_reloc(gpgpu->aux_buf.bo, - I915_GEM_DOMAIN_SAMPLER, 0, - gpgpu->aux_offset.sampler_state_offset, - gpgpu->aux_offset.idrt_offset + offsetof(gen6_interface_descriptor_t, desc2), - gpgpu->aux_buf.bo); +static void +intel_gpgpu_build_idrt_gen8(intel_gpgpu_t *gpgpu, cl_gpgpu_kernel *kernel) +{ + gen8_interface_descriptor_t *desc; + + desc = (gen8_interface_descriptor_t*) (gpgpu->aux_buf.bo->virtual + gpgpu->aux_offset.idrt_offset); + + memset(desc, 0, sizeof(*desc)); + desc->desc0.kernel_start_pointer = 0; /* reloc */ + desc->desc2.single_program_flow = 0; + desc->desc2.floating_point_mode = 0; /* use IEEE-754 rule */ + desc->desc6.rounding_mode = 0; /* round to nearest even */ + + assert((gpgpu->aux_buf.bo->offset + gpgpu->aux_offset.sampler_state_offset) % 32 == 0); + desc->desc3.sampler_state_pointer = gpgpu->aux_offset.sampler_state_offset >> 5; + desc->desc4.binding_table_entry_count = 0; /* no prefetch */ + desc->desc4.binding_table_pointer = 0; + desc->desc5.curbe_read_len = kernel->curbe_sz / 32; + desc->desc5.curbe_read_offset = 0; + + /* Barriers / SLM are automatically handled on Gen7+ */ + if (gpgpu->drv->gen_ver == 7 || gpgpu->drv->gen_ver == 75) { + size_t slm_sz = kernel->slm_sz; + desc->desc6.group_threads_num = kernel->use_slm ? 
kernel->thread_n : 0; + desc->desc6.barrier_enable = kernel->use_slm; + if (slm_sz <= 4*KB) + slm_sz = 4*KB; + else if (slm_sz <= 8*KB) + slm_sz = 8*KB; + else if (slm_sz <= 16*KB) + slm_sz = 16*KB; + else if (slm_sz <= 32*KB) + slm_sz = 32*KB; + else + slm_sz = 64*KB; + slm_sz = slm_sz >> 12; + desc->desc6.slm_sz = slm_sz; + } + else + desc->desc6.group_threads_num = kernel->barrierID; /* BarrierID on GEN6 */ } static int @@ -1213,7 +1291,7 @@ intel_gpgpu_set_perf_counters(intel_gpgpu_t *gpgpu, cl_buffer *perf) } static void -intel_gpgpu_walker(intel_gpgpu_t *gpgpu, +intel_gpgpu_walker_gen7(intel_gpgpu_t *gpgpu, uint32_t simd_sz, uint32_t thread_n, const size_t global_wk_off[3], @@ -1258,6 +1336,56 @@ intel_gpgpu_walker(intel_gpgpu_t *gpgpu, ADVANCE_BATCH(gpgpu->batch); } +static void +intel_gpgpu_walker_gen8(intel_gpgpu_t *gpgpu, + uint32_t simd_sz, + uint32_t thread_n, + const size_t global_wk_off[3], + const size_t global_wk_sz[3], + const size_t local_wk_sz[3]) +{ + const uint32_t global_wk_dim[3] = { + global_wk_sz[0] / local_wk_sz[0], + global_wk_sz[1] / local_wk_sz[1], + global_wk_sz[2] / local_wk_sz[2] + }; + uint32_t right_mask = ~0x0; + size_t group_sz = local_wk_sz[0] * local_wk_sz[1] * local_wk_sz[2]; + + assert(simd_sz == 8 || simd_sz == 16); + + uint32_t shift = (group_sz & (simd_sz - 1)); + shift = (shift == 0) ? 
simd_sz : shift; + right_mask = (1 << shift) - 1; + + BEGIN_BATCH(gpgpu->batch, 15); + OUT_BATCH(gpgpu->batch, CMD_GPGPU_WALKER | 13); + OUT_BATCH(gpgpu->batch, 0); /* kernel index == 0 */ + OUT_BATCH(gpgpu->batch, 0); /* Indirect Data Length */ + OUT_BATCH(gpgpu->batch, 0); /* Indirect Data Start Address */ + assert(thread_n <= 64); + if (simd_sz == 16) + OUT_BATCH(gpgpu->batch, (1 << 30) | (thread_n-1)); /* SIMD16 | thread max */ + else + OUT_BATCH(gpgpu->batch, (0 << 30) | (thread_n-1)); /* SIMD8 | thread max */ + OUT_BATCH(gpgpu->batch, 0); + OUT_BATCH(gpgpu->batch, 0); + OUT_BATCH(gpgpu->batch, global_wk_dim[0]); + OUT_BATCH(gpgpu->batch, 0); + OUT_BATCH(gpgpu->batch, 0); + OUT_BATCH(gpgpu->batch, global_wk_dim[1]); + OUT_BATCH(gpgpu->batch, 0); + OUT_BATCH(gpgpu->batch, global_wk_dim[2]); + OUT_BATCH(gpgpu->batch, right_mask); + OUT_BATCH(gpgpu->batch, ~0x0); /* we always set height as 1, so set bottom mask as all 1*/ + ADVANCE_BATCH(gpgpu->batch); + + BEGIN_BATCH(gpgpu->batch, 2); + OUT_BATCH(gpgpu->batch, CMD_MEDIA_STATE_FLUSH | 0); + OUT_BATCH(gpgpu->batch, 0); /* kernel index == 0 */ + ADVANCE_BATCH(gpgpu->batch); +} + static intel_event_t* intel_gpgpu_event_new(intel_gpgpu_t *gpgpu) { @@ -1504,7 +1632,6 @@ intel_set_gpgpu_callbacks(int device_id) cl_gpgpu_batch_start = (cl_gpgpu_batch_start_cb *) intel_gpgpu_batch_start; cl_gpgpu_batch_end = (cl_gpgpu_batch_end_cb *) intel_gpgpu_batch_end; cl_gpgpu_flush = (cl_gpgpu_flush_cb *) intel_gpgpu_flush; - cl_gpgpu_walker = (cl_gpgpu_walker_cb *) intel_gpgpu_walker; cl_gpgpu_bind_sampler = (cl_gpgpu_bind_sampler_cb *) intel_gpgpu_bind_sampler; cl_gpgpu_set_scratch = (cl_gpgpu_set_scratch_cb *) intel_gpgpu_set_scratch; cl_gpgpu_event_new = (cl_gpgpu_event_new_cb *)intel_gpgpu_event_new; @@ -1523,7 +1650,6 @@ intel_set_gpgpu_callbacks(int device_id) cl_gpgpu_get_printf_info = (cl_gpgpu_get_printf_info_cb *)intel_gpgpu_get_printf_info; if (IS_BROADWELL(device_id)) { - intel_gpgpu_setup_bti = 
intel_gpgpu_setup_bti_gen8; cl_gpgpu_bind_image = (cl_gpgpu_bind_image_cb *) intel_gpgpu_bind_image_gen75; intel_gpgpu_set_L3 = intel_gpgpu_set_L3_gen75; cl_gpgpu_get_cache_ctrl = (cl_gpgpu_get_cache_ctrl_cb *)intel_gpgpu_get_cache_ctrl_gen8; @@ -1531,9 +1657,19 @@ intel_set_gpgpu_callbacks(int device_id) intel_gpgpu_post_action = intel_gpgpu_post_action_gen75; intel_gpgpu_read_ts_reg = intel_gpgpu_read_ts_reg_gen7; //HSW same as ivb intel_gpgpu_set_base_address = intel_gpgpu_set_base_address_gen8; + intel_gpgpu_setup_bti = intel_gpgpu_setup_bti_gen8; + intel_gpgpu_load_vfe_state = intel_gpgpu_load_vfe_state_gen8; + cl_gpgpu_walker = (cl_gpgpu_walker_cb *)intel_gpgpu_walker_gen8; + intel_gpgpu_build_idrt = intel_gpgpu_build_idrt_gen8; return; } + intel_gpgpu_set_base_address = intel_gpgpu_set_base_address_gen7; + intel_gpgpu_setup_bti = intel_gpgpu_setup_bti_gen7; + intel_gpgpu_load_vfe_state = intel_gpgpu_load_vfe_state_gen7; + cl_gpgpu_walker = (cl_gpgpu_walker_cb *)intel_gpgpu_walker_gen7; + intel_gpgpu_build_idrt = intel_gpgpu_build_idrt_gen7; + if (IS_HASWELL(device_id)) { cl_gpgpu_bind_image = (cl_gpgpu_bind_image_cb *) intel_gpgpu_bind_image_gen75; intel_gpgpu_set_L3 = intel_gpgpu_set_L3_gen75; @@ -1541,8 +1677,6 @@ intel_set_gpgpu_callbacks(int device_id) intel_gpgpu_get_scratch_index = intel_gpgpu_get_scratch_index_gen75; intel_gpgpu_post_action = intel_gpgpu_post_action_gen75; intel_gpgpu_read_ts_reg = intel_gpgpu_read_ts_reg_gen7; //HSW same as ivb - intel_gpgpu_set_base_address = intel_gpgpu_set_base_address_gen7; - intel_gpgpu_setup_bti = intel_gpgpu_setup_bti_gen7; } else if (IS_IVYBRIDGE(device_id)) { cl_gpgpu_bind_image = (cl_gpgpu_bind_image_cb *) intel_gpgpu_bind_image_gen7; @@ -1556,7 +1690,5 @@ intel_set_gpgpu_callbacks(int device_id) cl_gpgpu_get_cache_ctrl = (cl_gpgpu_get_cache_ctrl_cb *)intel_gpgpu_get_cache_ctrl_gen7; intel_gpgpu_get_scratch_index = intel_gpgpu_get_scratch_index_gen7; intel_gpgpu_post_action = intel_gpgpu_post_action_gen7; - 
intel_gpgpu_set_base_address = intel_gpgpu_set_base_address_gen7; - intel_gpgpu_setup_bti = intel_gpgpu_setup_bti_gen7; } } diff --git a/src/intel/intel_structs.h b/src/intel/intel_structs.h index 2d8bc54..40185bf 100644 --- a/src/intel/intel_structs.h +++ b/src/intel/intel_structs.h @@ -98,6 +98,61 @@ typedef struct gen6_interface_descriptor uint32_t desc7; /* unused */ } gen6_interface_descriptor_t; +typedef struct gen8_interface_descriptor +{ + struct { + uint32_t pad6:6; + uint32_t kernel_start_pointer:26; + } desc0; + struct { + uint32_t kernel_start_pointer_high:16; + uint32_t pad6:16; + } desc1; + + struct { + uint32_t pad:7; + uint32_t software_exception:1; + uint32_t pad2:3; + uint32_t maskstack_exception:1; + uint32_t pad3:1; + uint32_t illegal_opcode_exception:1; + uint32_t pad4:2; + uint32_t floating_point_mode:1; + uint32_t thread_priority:1; + uint32_t single_program_flow:1; + uint32_t denorm_mode:1; + uint32_t thread_preemption_disable:1; + uint32_t pad5:11; + } desc2; + + struct { + uint32_t pad:2; + uint32_t sampler_count:3; + uint32_t sampler_state_pointer:27; + } desc3; + + struct { + uint32_t binding_table_entry_count:5; /* prefetch entries only */ + uint32_t binding_table_pointer:27; /* 11 bit only on IVB+ */ + } desc4; + + struct { + uint32_t curbe_read_offset:16; /* in GRFs */ + uint32_t curbe_read_len:16; /* in GRFs */ + } desc5; + + struct { + uint32_t group_threads_num:8; /* 0..64, 0 - no barrier use */ + uint32_t barrier_return_byte:8; + uint32_t slm_sz:5; /* 0..16 - 0K..64K */ + uint32_t barrier_enable:1; + uint32_t rounding_mode:2; + uint32_t barrier_return_grf_offset:8; + } desc6; + + uint32_t desc7; /* unused */ +} gen8_interface_descriptor_t; + typedef struct gen7_surface_state { struct { @@ -180,17 +235,12 @@ typedef struct gen7_surface_state typedef struct gen8_surface_state { struct { - union { - struct { - uint32_t cube_pos_z:1; - uint32_t cube_neg_z:1; - uint32_t cube_pos_y:1; - uint32_t cube_neg_y:1; - uint32_t cube_pos_x:1; - 
uint32_t cube_neg_x:1; - }; - uint32_t pad1:6; - }; + uint32_t cube_pos_z:1; + uint32_t cube_neg_z:1; + uint32_t cube_pos_y:1; + uint32_t cube_neg_y:1; + uint32_t cube_pos_x:1; + uint32_t cube_neg_x:1; uint32_t media_boundary_pixel_mode:2; uint32_t render_cache_rw_mode:1; uint32_t sampler_L2_bypass_mode:1; @@ -252,10 +302,7 @@ typedef struct gen8_surface_state uint32_t conherency_type:1; uint32_t pad3:3; uint32_t pad2:2; - union { - uint32_t pad1:1; - uint32_t cube_ewa:1; - }; + uint32_t cube_ewa:1; uint32_t y_offset:3; uint32_t pad0:1; uint32_t x_offset:7; @@ -302,25 +349,20 @@ typedef struct gen8_surface_state struct { uint32_t surface_base_addr_lo; - uint32_t surface_base_addr_hi; - } ss8_9; + } ss8; struct { - uint32_t pad5:10; - uint32_t pad4:1; - uint32_t pad3:1; + uint32_t surface_base_addr_hi; + } ss9; - union { - uint64_t aux_surface_base_addr:52; - struct { - uint32_t pad2:20; - uint32_t v_plane_y_offset:14; - uint32_t pad1:2; - uint32_t v_plane_x_offset:14; - uint32_t pad0:2; - }; - }; - } ss10_11; + struct { + uint32_t pad0:12; + uint32_t aux_base_addr_lo:20; + } ss10; + + struct { + uint32_t aux_base_addr_hi:32; + } ss11; struct { uint32_t pad0; -- 2.7.4