{
uint index1 = (inverse)? i*blockWidth + k : k * blockWidth + i;
uint index2 = getIdx(groupIdx, groupIdy, j, k, blockWidth, width);
-
acc += dct8x8[index1] * input[index2];
}
- inter[j*blockWidth + i] = acc;
+ inter[j*blockWidth + i] = acc;
/*
* Make sure all the values of inter that belong to a block
* are calculated before proceeding further
acc += inter[index1] * dct8x8[index2];
}
- output[idx] = acc;
+ output[idx] = acc;
}
__kernel void mmmKernel(__global float4 *matrixA,
__global float4 *matrixB,
__global float4* matrixC,
- uint widthA, uint widthB)
+ uint widthA, uint widthB)
{
int2 pos = (int2)(get_global_id(0), get_global_id(1));
matrixC[get_global_id(0) + (get_global_id(1) << TILEY_SHIFT) * get_global_size(0) + get_global_size(0)] = sum1;
matrixC[get_global_id(0) + (get_global_id(1) << TILEY_SHIFT) * get_global_size(0) + 2 * get_global_size(0)] = sum2;
matrixC[get_global_id(0) + (get_global_id(1) << TILEY_SHIFT) * get_global_size(0) + 3 * get_global_size(0)] = sum3;
-}
\ No newline at end of file
+}
err = cl_command_queue_set_fulsim_buffer(queue, mem);
error:
return err;
-
}
if (queue->next == NULL && queue->prev == NULL)
queue->ctx->queues = NULL;
pthread_mutex_unlock(&queue->ctx->queue_lock);
+ if (queue->fulsim_out != NULL) {
+ cl_mem_delete(queue->fulsim_out);
+ queue->fulsim_out = NULL;
+ }
cl_mem_delete(queue->perf);
cl_context_delete(queue->ctx);
intel_gpgpu_delete(queue->gpgpu);
cl_command_queue_set_fulsim_buffer(cl_command_queue queue, cl_mem mem)
{
#if USE_FULSIM
- cl_context ctx = queue->ctx;
- drm_intel_bufmgr *bufmgr = cl_context_get_intel_bufmgr(ctx);
- drm_intel_aub_set_bo_to_dump(bufmgr, mem->bo);
-#endif /* USE_FULSIM */
-
- queue->fulsim_out = mem;
if (queue->fulsim_out != NULL) {
cl_mem_delete(queue->fulsim_out);
queue->fulsim_out = NULL;
}
if (mem != NULL) {
+ cl_context ctx = queue->ctx;
+ drm_intel_bufmgr *bufmgr = cl_context_get_intel_bufmgr(ctx);
+ drm_intel_aub_set_bo_to_dump(bufmgr, mem->bo);
cl_mem_add_ref(mem);
queue->fulsim_out = mem;
}
-
+#endif /* USE_FULSIM */
return CL_SUCCESS;
}
* buffers and reuse them
*/
curr = 0;
- for (i = 0; i < local_wk_sz[0]; ++i)
+ for (k = 0; k < local_wk_sz[2]; ++k)
for (j = 0; j < local_wk_sz[1]; ++j)
- for (k = 0; k < local_wk_sz[2]; ++k, ++curr) {
+ for (i = 0; i < local_wk_sz[0]; ++i, ++curr) {
((uint16_t*) ids[0])[curr] = i;
((uint16_t*) ids[1])[curr] = j;
((uint16_t*) ids[2])[curr] = k;
TRY_ALLOC(ids[i], (uint16_t*) cl_calloc(sizeof(uint16_t), thread_n*16));
/* Compute the IDs */
- for (i = 0; i < local_wk_sz[0]; ++i)
+ for (k = 0; k < local_wk_sz[2]; ++k)
for (j = 0; j < local_wk_sz[1]; ++j)
- for (k = 0; k < local_wk_sz[2]; ++k, ++curr) {
- ((uint16_t*) ids[0])[curr] = i;
- ((uint16_t*) ids[1])[curr] = j;
- ((uint16_t*) ids[2])[curr] = k;
+ for (i = 0; i < local_wk_sz[0]; ++i, ++curr) {
+ ids[0][curr] = i;
+ ids[1][curr] = j;
+ ids[2][curr] = k;
}
/* Copy them to the constant buffer */
uint16_t *ids0 = (uint16_t *) (data + 0);
uint16_t *ids1 = (uint16_t *) (data + 32);
uint16_t *ids2 = (uint16_t *) (data + 64);
- for (j = 0; j < 16; ++j, ++curr) {/* SIMD16 */
+ for (j = 0; j < 16; ++j, ++curr) {
ids0[j] = ids[0][curr];
ids1[j] = ids[1][curr];
ids2[j] = ids[2][curr];
const size_t global_wk_sz[3],
const size_t local_wk_sz[3])
{
+ const uint32_t global_wk_dim[3] = {
+ global_wk_sz[0] / local_wk_sz[0],
+ global_wk_sz[1] / local_wk_sz[1],
+ global_wk_sz[2] / local_wk_sz[2]
+ };
+
BEGIN_BATCH(state->batch, 11);
OUT_BATCH(state->batch, CMD_GPGPU_WALKER | 9);
OUT_BATCH(state->batch, 0); /* kernel index == 0 */
OUT_BATCH(state->batch, (1 << 30) | (thread_n-1)); /* SIMD16 | thread max */
OUT_BATCH(state->batch, global_wk_off[0]);
- OUT_BATCH(state->batch, global_wk_sz[0]-1);
+ OUT_BATCH(state->batch, global_wk_dim[0]);
OUT_BATCH(state->batch, global_wk_off[1]);
- OUT_BATCH(state->batch, global_wk_sz[1]-1);
+ OUT_BATCH(state->batch, global_wk_dim[1]);
OUT_BATCH(state->batch, global_wk_off[2]);
- OUT_BATCH(state->batch, global_wk_sz[2]-1);
+ OUT_BATCH(state->batch, global_wk_dim[2]);
OUT_BATCH(state->batch, ~0x0);
OUT_BATCH(state->batch, ~0x0);
ADVANCE_BATCH(state->batch);