typedef pstatus_t (*__orC_32u_t)(const UINT32* pSrc, UINT32 val, UINT32* pDst, INT32 len);
typedef pstatus_t (*primitives_uninit_t)(void);
-
typedef struct
{
/* Memory-to-memory copy routines */
FREERDP_API void primitives_set_hints(primitive_hints hints);
FREERDP_API primitive_hints primitives_get_hints(void);
FREERDP_API primitives_t* primitives_get_generic(void);
- FREERDP_API DWORD primitives_flags(primitives_t *p);
- FREERDP_API BOOL primitives_init(primitives_t *p, primitive_hints hints);
+ FREERDP_API DWORD primitives_flags(primitives_t* p);
+ FREERDP_API BOOL primitives_init(primitives_t* p, primitive_hints hints);
FREERDP_API void primitives_uninit();
-
#ifdef __cplusplus
}
#endif
endif()
if (WITH_OPENCL)
- freerdp_definition_add(-DOPENCL_SOURCE_PATH="${CMAKE_INSTALL_PREFIX}/${FREERDP_PLUGIN_PATH}")
set(PRIMITIVES_OPENCL_SRCS primitives/prim_YUV_opencl.c)
freerdp_include_directory_add(${OpenCL_INCLUDE_DIRS})
freerdp_library_add(OpenCL::OpenCL)
- install(FILES primitives/primitives.cl DESTINATION ${FREERDP_PLUGIN_PATH})
endif()
set(PRIMITIVES_OPT_SRCS
if (order->fieldFlags & WINDOW_ORDER_FIELD_TITLE)
DUMP_APPEND(buffer, bufferSize, " title");
if (order->fieldFlags & WINDOW_ORDER_FIELD_CLIENT_AREA_OFFSET)
- DUMP_APPEND(buffer, bufferSize, " clientOffset=(%"PRId32",%"PRId32")",
- state->clientOffsetX, state->clientOffsetY);
+ DUMP_APPEND(buffer, bufferSize, " clientOffset=(%" PRId32 ",%" PRId32 ")",
+ state->clientOffsetX, state->clientOffsetY);
if (order->fieldFlags & WINDOW_ORDER_FIELD_CLIENT_AREA_SIZE)
DUMP_APPEND(buffer, bufferSize, " clientAreaWidth=%" PRIu32 " clientAreaHeight=%" PRIu32 "",
state->clientAreaWidth, state->clientAreaHeight);
if (order->fieldFlags & WINDOW_ORDER_FIELD_ROOT_PARENT)
DUMP_APPEND(buffer, bufferSize, " rootParent=0x%" PRIx32 "", state->rootParentHandle);
if (order->fieldFlags & WINDOW_ORDER_FIELD_WND_OFFSET)
- DUMP_APPEND(buffer, bufferSize, " windowOffset=(%"PRId32",%"PRId32")", state->windowOffsetX, state->windowOffsetY);
+ DUMP_APPEND(buffer, bufferSize, " windowOffset=(%" PRId32 ",%" PRId32 ")",
+ state->windowOffsetX, state->windowOffsetY);
if (order->fieldFlags & WINDOW_ORDER_FIELD_WND_CLIENT_DELTA)
- DUMP_APPEND(buffer, bufferSize, " windowClientDelta=(%"PRId32",%"PRId32")",
- state->windowClientDeltaX, state->windowClientDeltaY);
+ DUMP_APPEND(buffer, bufferSize, " windowClientDelta=(%" PRId32 ",%" PRId32 ")",
+ state->windowClientDeltaX, state->windowClientDeltaY);
if (order->fieldFlags & WINDOW_ORDER_FIELD_WND_SIZE)
DUMP_APPEND(buffer, bufferSize, " windowWidth=%" PRIu32 " windowHeight=%" PRIu32 "",
state->windowWidth, state->windowHeight);
}
if (order->fieldFlags & WINDOW_ORDER_FIELD_VIS_OFFSET)
- DUMP_APPEND(buffer, bufferSize, " visibleOffset=(%"PRId32",%"PRId32")", state->visibleOffsetX,
- state->visibleOffsetY);
+ DUMP_APPEND(buffer, bufferSize, " visibleOffset=(%" PRId32 ",%" PRId32 ")",
+ state->visibleOffsetX, state->visibleOffsetY);
if (order->fieldFlags & WINDOW_ORDER_FIELD_VISIBILITY)
{
{
cl_int ret;
int i;
- cl_mem objs[3] = {NULL, NULL, NULL};
+ cl_mem objs[3] = { NULL, NULL, NULL };
cl_mem destObj;
cl_kernel kernel;
+ cl_event events[3];
size_t indexes[2];
- const char *sourceNames[] = {"Y", "U", "V"};
- primitives_opencl_context *cl = primitives_get_opencl_context();
+ const char* sourceNames[] = { "Y", "U", "V" };
+ primitives_opencl_context* cl = primitives_get_opencl_context();
kernel = clCreateKernel(cl->program, kernelName, &ret);
if (ret != CL_SUCCESS)
for (i = 0; i < 3; i++)
{
- objs[i] = clCreateBuffer(cl->context, CL_MEM_READ_ONLY, srcStep[i] * roi->height, NULL, &ret);
+ objs[i] =
+ clCreateBuffer(cl->context, CL_MEM_READ_ONLY, srcStep[i] * roi->height, NULL, &ret);
if (ret != CL_SUCCESS)
{
WLog_ERR(TAG, "unable to create %sobj", sourceNames[i]);
goto error_objs;
}
- ret = clEnqueueWriteBuffer(cl->commandQueue, objs[i], CL_TRUE, 0, srcStep[i] * roi->height,
- pSrc[i], 0, NULL, NULL);
+ ret = clEnqueueWriteBuffer(cl->commandQueue, objs[i], CL_FALSE, 0, srcStep[i] * roi->height,
+ pSrc[i], 0, NULL, &events[i]);
if (ret != CL_SUCCESS)
{
WLog_ERR(TAG, "unable to enqueue write command for %sobj", sourceNames[i]);
/* push source + stride arguments*/
for (i = 0; i < 3; i++)
{
- ret = clSetKernelArg(kernel, i * 2, sizeof(cl_mem), (void *)&objs[i]);
+ ret = clSetKernelArg(kernel, i * 2, sizeof(cl_mem), (void*)&objs[i]);
if (ret != CL_SUCCESS)
{
WLog_ERR(TAG, "unable to set arg for %sobj", sourceNames[i]);
goto error_set_args;
}
- ret = clSetKernelArg(kernel, i * 2 + 1, sizeof(cl_int), (void *)&srcStep[i]);
+ ret = clSetKernelArg(kernel, i * 2 + 1, sizeof(cl_int), (void*)&srcStep[i]);
if (ret != CL_SUCCESS)
{
WLog_ERR(TAG, "unable to set arg stride for %sobj", sourceNames[i]);
}
}
- ret = clSetKernelArg(kernel, 6, sizeof(cl_mem), (void *)&destObj);
+ ret = clSetKernelArg(kernel, 6, sizeof(cl_mem), (void*)&destObj);
if (ret != CL_SUCCESS)
{
WLog_ERR(TAG, "unable to set arg destObj");
goto error_set_args;
}
- ret = clSetKernelArg(kernel, 7, sizeof(cl_int), (void *)&dstStep);
+ ret = clSetKernelArg(kernel, 7, sizeof(cl_int), (void*)&dstStep);
if (ret != CL_SUCCESS)
{
WLog_ERR(TAG, "unable to set arg dstStep");
indexes[0] = roi->width;
indexes[1] = roi->height;
- ret = clEnqueueNDRangeKernel(cl->commandQueue, kernel, 2, NULL, indexes, NULL,
- 0, NULL, NULL);
+ ret = clEnqueueNDRangeKernel(cl->commandQueue, kernel, 2, NULL, indexes, NULL, 3, events, NULL);
if (ret != CL_SUCCESS)
{
WLog_ERR(TAG, "unable to enqueue call kernel");
}
/* Transfer result to host */
- ret = clEnqueueReadBuffer(cl->commandQueue, destObj, CL_TRUE, 0, roi->height * dstStep, pDst, 0, NULL, NULL);
+ ret = clEnqueueReadBuffer(cl->commandQueue, destObj, CL_TRUE, 0, roi->height * dstStep, pDst, 0,
+ NULL, NULL);
if (ret != CL_SUCCESS)
{
WLog_ERR(TAG, "unable to read back buffer");
return PRIMITIVES_SUCCESS;
}
+static const char* openclProgram =
+#include "primitives.cl"
+ ;
+
BOOL primitives_init_opencl_context(primitives_opencl_context* cl)
{
cl_platform_id* platform_ids = NULL;
cl_uint ndevices, nplatforms, i;
cl_kernel kernel;
cl_int ret;
- char sourcePath[1000];
BOOL gotGPU = FALSE;
- FILE* f;
size_t programLen;
- char* programSource;
ret = clGetPlatformIDs(0, NULL, &nplatforms);
if (ret != CL_SUCCESS || nplatforms < 1)
return FALSE;
}
- snprintf(sourcePath, sizeof(sourcePath), "%s/primitives.cl", OPENCL_SOURCE_PATH);
-
- f = fopen(sourcePath, "r");
- if (!f)
- {
- WLog_ERR(TAG, "openCL: unable to open source file %s", sourcePath);
- goto error_source_file;
- }
-
- fseek(f, 0, SEEK_END);
- programLen = ftell(f);
- fseek(f, 0, SEEK_SET);
-
- programSource = malloc(programLen);
- if (!programSource)
- {
- WLog_ERR(TAG, "openCL: unable to allocate memory(%d bytes) for source file %s", programLen,
- sourcePath);
- fclose(f);
- goto error_source_file;
- }
-
- if (fread(programSource, programLen, 1, f) <= 0)
- {
- WLog_ERR(TAG, "openCL: unable to read openCL program in %s", sourcePath);
- free(programSource);
- fclose(f);
- goto error_source_file;
- }
- fclose(f);
-
+ programLen = strlen(openclProgram);
cl->program =
- clCreateProgramWithSource(cl->context, 1, (const char**)&programSource, &programLen, &ret);
+ clCreateProgramWithSource(cl->context, 1, (const char**)&openclProgram, &programLen, &ret);
if (ret != CL_SUCCESS)
{
- WLog_ERR(TAG, "openCL: unable to create command queue");
+ WLog_ERR(TAG, "openCL: unable to create program");
goto out_program_create;
}
- free(programSource);
ret = clBuildProgram(cl->program, 1, &cl->deviceId, NULL, NULL, NULL);
if (ret != CL_SUCCESS)
out_program_build:
clReleaseProgram(cl->program);
-error_source_file:
out_program_create:
clReleaseCommandQueue(cl->commandQueue);
clReleaseContext(cl->context);
}
static pstatus_t opencl_YUV420ToRGB_8u_P3AC4R(const BYTE* pSrc[3], const UINT32 srcStep[3],
- BYTE* pDst, UINT32 dstStep, UINT32 DstFormat, const prim_size_t* roi)
+ BYTE* pDst, UINT32 dstStep, UINT32 DstFormat,
+ const prim_size_t* roi)
{
- const char *kernel_name;
+ const char* kernel_name;
- switch(DstFormat)
+ switch (DstFormat)
{
case PIXEL_FORMAT_BGRA32:
case PIXEL_FORMAT_BGRX32:
void primitives_init_YUV_opencl(primitives_t* prims)
{
prims->YUV420ToRGB_8u_P3AC4R = opencl_YUV420ToRGB_8u_P3AC4R;
-
}
-
-
}
}
-
void primitives_init_YUV_opt(primitives_t* prims)
{
generic = primitives_get_generic();
#define HAVE_CPU_OPTIMIZED_PRIMITIVES 1
#endif
-
#if defined(WITH_SSE2)
/* Use lddqu for unaligned; load for 16-byte aligned. */
#define LOAD_SI128(_ptr_) \
return primitivesHints;
}
-
/* Singleton pointer used throughout the program when requested. */
static primitives_t pPrimitivesGeneric = { 0 };
static INIT_ONCE generic_primitives_InitOnce = INIT_ONCE_STATIC_INIT;
#endif
-#if defined(HAVE_OPTIMIZED_PRIMITIVES)
static INIT_ONCE auto_primitives_InitOnce = INIT_ONCE_STATIC_INIT;
-#endif
static primitives_t pPrimitives = { 0 };
/* ------------------------------------------------------------------------- */
-static BOOL primitives_init_generic(primitives_t *prims)
+static BOOL primitives_init_generic(primitives_t* prims)
{
primitives_init_add(prims);
primitives_init_andor(prims);
return primitives_init_generic(&pPrimitivesGeneric);
}
-static BOOL primitives_init_optimized(primitives_t *prims)
+static BOOL primitives_init_optimized(primitives_t* prims)
{
primitives_init_generic(prims);
return TRUE;
}
-typedef struct {
- BYTE *channels[3];
+typedef struct
+{
+ BYTE* channels[3];
UINT32 steps[3];
prim_size_t roi;
- BYTE *outputBuffer;
+ BYTE* outputBuffer;
UINT32 outputStride;
UINT32 testedFormat;
} primitives_YUV_benchmark;
for (i = 0; i < 3; i++)
{
- BYTE *buf = ret->channels[i] = malloc(roi->width * roi->height);
+ BYTE* buf = ret->channels[i] = malloc(roi->width * roi->height);
if (!buf)
goto fail;
return ret;
}
-static BOOL primitives_YUV_benchmark_run(primitives_YUV_benchmark *bench, primitives_t *prims,
- UINT64 runTime, UINT32 *computations)
+static BOOL primitives_YUV_benchmark_run(primitives_YUV_benchmark* bench, primitives_t* prims,
+ UINT64 runTime, UINT32* computations)
{
- ULONGLONG dueDate = GetTickCount64() + runTime;
- const BYTE *channels[3];
+ ULONGLONG dueDate;
+ const BYTE* channels[3];
int i;
*computations = 0;
for (i = 0; i < 3; i++)
channels[i] = bench->channels[i];
+ /* do a first dry run to initialize cache and such */
+ pstatus_t status =
+ prims->YUV420ToRGB_8u_P3AC4R(channels, bench->steps, bench->outputBuffer,
+ bench->outputStride, bench->testedFormat, &bench->roi);
+ if (status != PRIMITIVES_SUCCESS)
+ return FALSE;
+
+ /* let's run the benchmark */
+ dueDate = GetTickCount64() + runTime;
while (GetTickCount64() < dueDate)
{
- pstatus_t status = prims->YUV420ToRGB_8u_P3AC4R(channels, bench->steps, bench->outputBuffer,
- bench->outputStride, bench->testedFormat, &bench->roi);
+ pstatus_t status =
+ prims->YUV420ToRGB_8u_P3AC4R(channels, bench->steps, bench->outputBuffer,
+ bench->outputStride, bench->testedFormat, &bench->roi);
if (status != PRIMITIVES_SUCCESS)
return FALSE;
*computations = *computations + 1;
return TRUE;
}
-static BOOL primitives_autodetect_best(primitives_t *prims)
+static BOOL primitives_autodetect_best(primitives_t* prims)
{
BOOL ret = FALSE;
- UINT64 benchDuration = 150; // 100 ms
+ UINT64 benchDuration = 150; /* 150 ms */
UINT32 genericCount = 0;
UINT32 bestCount;
primitives_t* genericPrims = primitives_get_generic();
}
#endif
- WLog_DBG(TAG,
- "benchmark result: generic=%" PRIu32
+ WLog_DBG(TAG, "primitives benchmark result:");
+ WLog_DBG(TAG, " * generic=%" PRIu32, genericCount);
#if defined(HAVE_CPU_OPTIMIZED_PRIMITIVES)
- " optimized=%" PRIu32
+ WLog_DBG(TAG, " * optimized=%" PRIu32, optimizedCount);
#endif
#if defined(WITH_OPENCL)
- " openCL=%" PRIu32
-#endif
- ,
- genericCount
-#if defined(HAVE_CPU_OPTIMIZED_PRIMITIVES)
- ,
- optimizedCount
+ WLog_DBG(TAG, " * openCL=%" PRIu32, openclCount);
#endif
-#if defined(WITH_OPENCL)
- ,
- openclCount
-#endif
- );
WLog_INFO(TAG, "primitives autodetect, using %s", primName);
ret = TRUE;
out:
WINPR_UNUSED(param);
WINPR_UNUSED(context);
+ if (!primitives_init_optimized(&pPrimitivesGpu))
+ return FALSE;
+
if (!primitives_init_opencl(&pPrimitivesGpu))
return FALSE;
WINPR_UNUSED(param);
WINPR_UNUSED(context);
- if (!primitives_init_optimized(&pPrimitivesCpu))
- return FALSE;
- return TRUE;
+ return primitives_init_optimized(&pPrimitivesCpu);
}
#endif
BOOL primitives_init(primitives_t* p, primitive_hints hints)
{
- switch(hints)
+ switch (hints)
{
case PRIMITIVES_AUTODETECT:
return primitives_autodetect_best(p);
case PRIMITIVES_PURE_SOFT:
*p = pPrimitivesGeneric;
return TRUE;
-#if defined(HAVE_CPU_OPTIMIZED_PRIMITIVES)
case PRIMITIVES_ONLY_CPU:
+#if defined(HAVE_CPU_OPTIMIZED_PRIMITIVES)
*p = pPrimitivesCpu;
- return TRUE;
+#else
+ *p = pPrimitivesGeneric;
#endif
-#if defined(WITH_OPENCL)
+ return TRUE;
case PRIMITIVES_ONLY_GPU:
+#if defined(WITH_OPENCL)
*p = pPrimitivesGpu;
return TRUE;
+#else
+ return FALSE;
#endif
default:
WLog_ERR(TAG, "unknown hint %d", hints);
#if defined(HAVE_CPU_OPTIMIZED_PRIMITIVES)
InitOnceExecuteOnce(&cpu_primitives_InitOnce, primitives_init_cpu_cb, NULL, NULL);
#endif
-#if defined(HAVE_CPU_OPTIMIZED_PRIMITIVES)
+#if defined(WITH_OPENCL)
InitOnceExecuteOnce(&gpu_primitives_InitOnce, primitives_init_gpu_cb, NULL, NULL);
#endif
InitOnceExecuteOnce(&auto_primitives_InitOnce, primitives_auto_init_cb, NULL, NULL);
primitives_t* primitives_get_by_type(DWORD type)
{
InitOnceExecuteOnce(&generic_primitives_InitOnce, primitives_init_generic_cb, NULL, NULL);
+
switch (type)
{
-#if defined(WITH_OPENCL)
case PRIMITIVES_ONLY_GPU:
- InitOnceExecuteOnce(&gpu_primitives_InitOnce, primitives_init_cpu_cb, NULL, NULL);
- return &pPrimitivesGpu;
+#if defined(WITH_OPENCL)
+ if (InitOnceExecuteOnce(&gpu_primitives_InitOnce, primitives_init_gpu_cb, NULL, NULL))
+ return &pPrimitivesGpu;
#endif
-#if defined(HAVE_CPU_OPTIMIZED_PRIMITIVES)
+ return NULL;
+
case PRIMITIVES_ONLY_CPU:
- InitOnceExecuteOnce(&cpu_primitives_InitOnce, primitives_init_cpu_cb, NULL, NULL);
- return &pPrimitivesCpu;
+#if defined(HAVE_CPU_OPTIMIZED_PRIMITIVES)
+ if (InitOnceExecuteOnce(&cpu_primitives_InitOnce, primitives_init_cpu_cb, NULL, NULL))
+ return &pPrimitivesCpu;
#endif
+ return NULL;
+
case PRIMITIVES_PURE_SOFT:
default:
return &pPrimitivesGeneric;
/**
* FreeRDP: A Remote Desktop Protocol Implementation
* Optimized operations using openCL
+ * vi:ts=4 sw=4
*
* Copyright 2019 David Fort <contact@hardening-consulting.com>
+ * Copyright 2019 Rangee Gmbh
*
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License. You may obtain
+ * a copy of the License at http://www.apache.org/licenses/LICENSE-2.0.
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing
+ * permissions and limitations under the License.
*/
+#define STRINGIFY(x) #x
+
+STRINGIFY(
unsigned char clamp_uc(int v, int l, int h)
{
if (v > h)
destPtr[2] = clamp_uc((y256 + (403 * V)) >> 8, 0, 255); /* R */
destPtr[3] = 0xff; /* A */
}
-
+)