From: bsegovia Date: Fri, 5 Aug 2011 04:46:06 +0000 (+0000) Subject: Now IVB performs as well as SNB. All tests except for fft pass (for both). X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=2b069ade5fc4ea3c23839771915b78d561ee9d28;p=contrib%2Fbeignet.git Now IVB performs as well as SNB. All tests except for fft pass (for both). --- diff --git a/kernels/dct_kernels.cl b/kernels/dct_kernels.cl index ed1e0f1..76ebe20 100644 --- a/kernels/dct_kernels.cl +++ b/kernels/dct_kernels.cl @@ -59,11 +59,10 @@ void DCT(__global float * output, { uint index1 = (inverse)? i*blockWidth + k : k * blockWidth + i; uint index2 = getIdx(groupIdx, groupIdy, j, k, blockWidth, width); - acc += dct8x8[index1] * input[index2]; } - inter[j*blockWidth + i] = acc; + inter[j*blockWidth + i] = acc; /* * Make sure all the values of inter that belong to a block * are calculated before proceeding further @@ -81,6 +80,6 @@ void DCT(__global float * output, acc += inter[index1] * dct8x8[index2]; } - output[idx] = acc; + output[idx] = acc; } diff --git a/kernels/matmul_kernels.cl b/kernels/matmul_kernels.cl index b414e29..d271030 100644 --- a/kernels/matmul_kernels.cl +++ b/kernels/matmul_kernels.cl @@ -9,7 +9,7 @@ __kernel void mmmKernel(__global float4 *matrixA, __global float4 *matrixB, __global float4* matrixC, - uint widthA, uint widthB) + uint widthA, uint widthB) { int2 pos = (int2)(get_global_id(0), get_global_id(1)); @@ -224,4 +224,4 @@ __kernel void mmmKernel_local2(__global float4 *matrixA, matrixC[get_global_id(0) + (get_global_id(1) << TILEY_SHIFT) * get_global_size(0) + get_global_size(0)] = sum1; matrixC[get_global_id(0) + (get_global_id(1) << TILEY_SHIFT) * get_global_size(0) + 2 * get_global_size(0)] = sum2; matrixC[get_global_id(0) + (get_global_id(1) << TILEY_SHIFT) * get_global_size(0) + 3 * get_global_size(0)] = sum3; -} \ No newline at end of file +} diff --git a/src/cl_api.c b/src/cl_api.c index dee6785..0b37ba8 100644 --- a/src/cl_api.c +++ b/src/cl_api.c @@ -1124,6 +1124,5 @@ clFulsimSetOutputBuffer(cl_command_queue queue, cl_mem mem) err = cl_command_queue_set_fulsim_buffer(queue, mem); error: return err; - } diff --git a/src/cl_command_queue.c b/src/cl_command_queue.c index b4e2735..76d8c72 100644 --- a/src/cl_command_queue.c +++ b/src/cl_command_queue.c @@ -82,6 +82,10 @@ cl_command_queue_delete(cl_command_queue queue) if (queue->next == NULL && queue->prev == NULL) queue->ctx->queues = NULL; pthread_mutex_unlock(&queue->ctx->queue_lock); + if (queue->fulsim_out != NULL) { + cl_mem_delete(queue->fulsim_out); + queue->fulsim_out = NULL; + } cl_mem_delete(queue->perf); cl_context_delete(queue->ctx); intel_gpgpu_delete(queue->gpgpu); @@ -292,21 +296,18 @@ LOCAL cl_int cl_command_queue_set_fulsim_buffer(cl_command_queue queue, cl_mem mem) { #if USE_FULSIM - cl_context ctx = queue->ctx; - drm_intel_bufmgr *bufmgr = cl_context_get_intel_bufmgr(ctx); - drm_intel_aub_set_bo_to_dump(bufmgr, mem->bo); -#endif /* USE_FULSIM */ - - queue->fulsim_out = mem; if (queue->fulsim_out != NULL) { cl_mem_delete(queue->fulsim_out); queue->fulsim_out = NULL; } if (mem != NULL) { + cl_context ctx = queue->ctx; + drm_intel_bufmgr *bufmgr = cl_context_get_intel_bufmgr(ctx); + drm_intel_aub_set_bo_to_dump(bufmgr, mem->bo); cl_mem_add_ref(mem); queue->fulsim_out = mem; } - +#endif /* USE_FULSIM */ return CL_SUCCESS; } diff --git a/src/cl_command_queue_gen6.c b/src/cl_command_queue_gen6.c index 763b757..4ee8b02 100644 --- a/src/cl_command_queue_gen6.c +++ b/src/cl_command_queue_gen6.c @@ -186,9 +186,9 @@ cl_command_queue_ND_range_gen6(cl_command_queue queue, * buffers and reuse them */ curr = 0; - for (i = 0; i < local_wk_sz[0]; ++i) + for (k = 0; k < local_wk_sz[2]; ++k) for (j = 0; j < local_wk_sz[1]; ++j) - for (k = 0; k < local_wk_sz[2]; ++k, ++curr) { + for (i = 0; i < local_wk_sz[0]; ++i, ++curr) { ((uint16_t*) ids[0])[curr] = i; ((uint16_t*) ids[1])[curr] = j; ((uint16_t*) ids[2])[curr] = k; diff --git a/src/cl_command_queue_gen7.c b/src/cl_command_queue_gen7.c index 9a65d98..f378290 100644 --- a/src/cl_command_queue_gen7.c +++ b/src/cl_command_queue_gen7.c @@ -55,12 +55,12 @@ cl_set_local_ids(char *data, TRY_ALLOC(ids[i], (uint16_t*) cl_calloc(sizeof(uint16_t), thread_n*16)); /* Compute the IDs */ - for (i = 0; i < local_wk_sz[0]; ++i) + for (k = 0; k < local_wk_sz[2]; ++k) for (j = 0; j < local_wk_sz[1]; ++j) - for (k = 0; k < local_wk_sz[2]; ++k, ++curr) { - ((uint16_t*) ids[0])[curr] = i; - ((uint16_t*) ids[1])[curr] = j; - ((uint16_t*) ids[2])[curr] = k; + for (i = 0; i < local_wk_sz[0]; ++i, ++curr) { + ids[0][curr] = i; + ids[1][curr] = j; + ids[2][curr] = k; } /* Copy them to the constant buffer */ @@ -70,7 +70,7 @@ cl_set_local_ids(char *data, uint16_t *ids0 = (uint16_t *) (data + 0); uint16_t *ids1 = (uint16_t *) (data + 32); uint16_t *ids2 = (uint16_t *) (data + 64); - for (j = 0; j < 16; ++j, ++curr) {/* SIMD16 */ + for (j = 0; j < 16; ++j, ++curr) { ids0[j] = ids[0][curr]; ids1[j] = ids[1][curr]; ids2[j] = ids[2][curr]; diff --git a/src/intel/intel_gpgpu.c b/src/intel/intel_gpgpu.c index 002992e..6929d10 100644 --- a/src/intel/intel_gpgpu.c +++ b/src/intel/intel_gpgpu.c @@ -1099,16 +1099,22 @@ gpgpu_walker(intel_gpgpu_t *state, const size_t global_wk_sz[3], const size_t local_wk_sz[3]) { + const uint32_t global_wk_dim[3] = { + global_wk_sz[0] / local_wk_sz[0], + global_wk_sz[1] / local_wk_sz[1], + global_wk_sz[2] / local_wk_sz[2] + }; + BEGIN_BATCH(state->batch, 11); OUT_BATCH(state->batch, CMD_GPGPU_WALKER | 9); OUT_BATCH(state->batch, 0); /* kernel index == 0 */ OUT_BATCH(state->batch, (1 << 30) | (thread_n-1)); /* SIMD16 | thread max */ OUT_BATCH(state->batch, global_wk_off[0]); - OUT_BATCH(state->batch, global_wk_sz[0]-1); + OUT_BATCH(state->batch, global_wk_dim[0]); OUT_BATCH(state->batch, global_wk_off[1]); - OUT_BATCH(state->batch, global_wk_sz[1]-1); + OUT_BATCH(state->batch, global_wk_dim[1]); OUT_BATCH(state->batch, global_wk_off[2]); - OUT_BATCH(state->batch, global_wk_sz[2]-1); + OUT_BATCH(state->batch, global_wk_dim[2]); OUT_BATCH(state->batch, ~0x0); OUT_BATCH(state->batch, ~0x0); ADVANCE_BATCH(state->batch);