# the run-time itself
ADD_SUBDIRECTORY(src)
+ADD_SUBDIRECTORY(tests)
-INCLUDE_DIRECTORIES(
- ${CMAKE_CURRENT_SOURCE_DIR}
- ${DRM_INCLUDE_PATH}
- ${GBE_INCLUDE_PATH}
- ${CMAKE_CURRENT_SOURCE_DIR}/../include)
+INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}
+ ${DRM_INCLUDE_PATH}
+ ${GBE_INCLUDE_PATH}
+ ${CMAKE_CURRENT_SOURCE_DIR}/../include)
SET(OPENCL_SRC
cl_api.c
${XFIXES_LIBRARY}
${DRM_LIBRARY})
-ADD_LIBRARY(cl_test STATIC
- tests/common.c
- tests/cl_test.c
- tests/cl_file_map.c)
-TARGET_LINK_LIBRARIES(cl_test cl)
-
-ADD_EXECUTABLE(test_write_only tests/test_write_only.c)
-ADD_EXECUTABLE(test_flat_address_space tests/test_flat_address_space.c)
-ADD_EXECUTABLE(test_copy_buffer tests/test_copy_buffer.c)
-ADD_EXECUTABLE(test_copy_buffer_row tests/test_copy_buffer_row.c)
-ADD_EXECUTABLE(test_eot tests/test_eot.c)
-TARGET_LINK_LIBRARIES(test_eot cl_test m)
-TARGET_LINK_LIBRARIES(test_write_only cl_test m)
-TARGET_LINK_LIBRARIES(test_flat_address_space cl_test m)
-TARGET_LINK_LIBRARIES(test_copy_buffer cl_test m)
-TARGET_LINK_LIBRARIES(test_copy_buffer_row cl_test m)
-
-#ADD_EXECUTABLE(test_copy_buffer tests/test_copy_buffer.c)
-#ADD_EXECUTABLE(test_copy_image tests/test_copy_image.c)
-#ADD_EXECUTABLE(test_enqueue_read tests/test_enqueue_read.c)
-#ADD_EXECUTABLE(test_imm_parameters tests/test_imm_parameters.c)
-#ADD_EXECUTABLE(test_2d_copy tests/test_2d_copy.c)
-#ADD_EXECUTABLE(test_barrier tests/test_barrier.c)
-#ADD_EXECUTABLE(test_static_local_memory tests/test_static_local_memory.c)
-#ADD_EXECUTABLE(test_local_memory tests/test_local_memory.c)
-#ADD_EXECUTABLE(test_private_memory tests/test_private_memory.c)
-#ADD_EXECUTABLE(test_constant_memory tests/test_constant_memory.c)
-#ADD_EXECUTABLE(test_memory_leak tests/test_memory_leak.c)
-#ADD_EXECUTABLE(mandelbrot tests/mandelbrot.c)
-#ADD_EXECUTABLE(mersenneTwister tests/mersenneTwister.c)
-#ADD_EXECUTABLE(blackscholes tests/blackscholes.c)
-#ADD_EXECUTABLE(matmul tests/matmul.c)
-#ADD_EXECUTABLE(urng tests/urng.c)
-#ADD_EXECUTABLE(fastWalsh tests/fastWalsh.c)
-#ADD_EXECUTABLE(fft tests/fft.c)
-#ADD_EXECUTABLE(dct tests/dct.c)
-#ADD_EXECUTABLE(binomialOption tests/binomialOption.c)
-#ADD_EXECUTABLE(nbody tests/nbody.c)
-#ADD_EXECUTABLE(svm_test tests/svm_test.c)
-#TARGET_LINK_LIBRARIES(test_copy_buffer cl_test m)
-#TARGET_LINK_LIBRARIES(test_copy_image cl_test m)
-#TARGET_LINK_LIBRARIES(test_enqueue_read cl_test m)
-#TARGET_LINK_LIBRARIES(test_imm_parameters cl_test m)
-#TARGET_LINK_LIBRARIES(test_2d_copy cl_test m)
-#TARGET_LINK_LIBRARIES(test_barrier cl_test m)
-#TARGET_LINK_LIBRARIES(test_static_local_memory cl_test m)
-#TARGET_LINK_LIBRARIES(test_local_memory cl_test m)
-#TARGET_LINK_LIBRARIES(test_private_memory cl_test m)
-#TARGET_LINK_LIBRARIES(test_constant_memory cl_test m)
-#TARGET_LINK_LIBRARIES(test_memory_leak cl_test m)
-#TARGET_LINK_LIBRARIES(mandelbrot cl_test m)
-#TARGET_LINK_LIBRARIES(mersenneTwister cl_test m)
-#TARGET_LINK_LIBRARIES(blackscholes cl_test m)
-#TARGET_LINK_LIBRARIES(matmul cl_test m)
-#TARGET_LINK_LIBRARIES(urng cl_test m)
-#TARGET_LINK_LIBRARIES(fastWalsh cl_test m)
-#TARGET_LINK_LIBRARIES(fft cl_test m)
-#TARGET_LINK_LIBRARIES(dct cl_test m)
-#TARGET_LINK_LIBRARIES(binomialOption cl_test m)
-#TARGET_LINK_LIBRARIES(nbody cl_test m)
-#TARGET_LINK_LIBRARIES(svm_test cl_test m)
-
--- /dev/null
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia@intel.com>
+ */
+
+#if 0
+/* ============================================================
+
+Copyright (c) 2009 Advanced Micro Devices, Inc. All rights reserved.
+
+Redistribution and use of this material is permitted under the following
+conditions:
+
+Redistributions must retain the above copyright notice and all terms of this
+license.
+
+In no event shall anyone redistributing or accessing or using this material
+commence or participate in any arbitration or legal action relating to this
+material against Advanced Micro Devices, Inc. or any copyright holders or
+contributors. The foregoing shall survive any expiration or termination of
+this license or any agreement or access or use related to this material.
+
+ANY BREACH OF ANY TERM OF THIS LICENSE SHALL RESULT IN THE IMMEDIATE REVOCATION
+OF ALL RIGHTS TO REDISTRIBUTE, ACCESS OR USE THIS MATERIAL.
+
+THIS MATERIAL IS PROVIDED BY ADVANCED MICRO DEVICES, INC. AND ANY COPYRIGHT
+HOLDERS AND CONTRIBUTORS "AS IS" IN ITS CURRENT CONDITION AND WITHOUT ANY
+REPRESENTATIONS, GUARANTEE, OR WARRANTY OF ANY KIND OR IN ANY WAY RELATED TO
+SUPPORT, INDEMNITY, ERROR FREE OR UNINTERRUPTED OPERA TION, OR THAT IT IS FREE
+FROM DEFECTS OR VIRUSES. ALL OBLIGATIONS ARE HEREBY DISCLAIMED - WHETHER
+EXPRESS, IMPLIED, OR STATUTORY - INCLUDING, BUT NOT LIMITED TO, ANY IMPLIED
+WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE,
+ACCURACY, COMPLETENESS, OPERABILITY, QUALITY OF SERVICE, OR NON-INFRINGEMENT.
+IN NO EVENT SHALL ADVANCED MICRO DEVICES, INC. OR ANY COPYRIGHT HOLDERS OR
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, PUNITIVE,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
+OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, REVENUE, DATA, OR PROFITS; OR
+BUSINESS INTERRUPTION) HOWEVER CAUSED OR BASED ON ANY THEORY OF LIABILITY
+ARISING IN ANY WAY RELATED TO THIS MATERIAL, EVEN IF ADVISED OF THE POSSIBILITY
+OF SUCH DAMAGE. THE ENTIRE AND AGGREGATE LIABILITY OF ADVANCED MICRO DEVICES,
+INC. AND ANY COPYRIGHT HOLDERS AND CONTRIBUTORS SHALL NOT EXCEED TEN DOLLARS
+(US $10.00). ANYONE REDISTRIBUTING OR ACCESSING OR USING THIS MATERIAL ACCEPTS
+THIS ALLOCATION OF RISK AND AGREES TO RELEASE ADVANCED MICRO DEVICES, INC. AND
+ANY COPYRIGHT HOLDERS AND CONTRIBUTORS FROM ANY AND ALL LIABILITIES,
+OBLIGATIONS, CLAIMS, OR DEMANDS IN EXCESS OF TEN DOLLARS (US $10.00). THE
+FOREGOING ARE ESSENTIAL TERMS OF THIS LICENSE AND, IF ANY OF THESE TERMS ARE
+CONSTRUED AS UNENFORCEABLE, FAIL IN ESSENTIAL PURPOSE, OR BECOME VOID OR
+DETRIMENTAL TO ADVANCED MICRO DEVICES, INC. OR ANY COPYRIGHT HOLDERS OR
+CONTRIBUTORS FOR ANY REASON, THEN ALL RIGHTS TO REDISTRIBUTE, ACCESS OR USE
+THIS MATERIAL SHALL TERMINATE IMMEDIATELY. MOREOVER, THE FOREGOING SHALL
+SURVIVE ANY EXPIRATION OR TERMINATION OF THIS LICENSE OR ANY AGREEMENT OR
+ACCESS OR USE RELATED TO THIS MATERIAL.
+
+NOTICE IS HEREBY PROVIDED, AND BY REDISTRIBUTING OR ACCESSING OR USING THIS
+MATERIAL SUCH NOTICE IS ACKNOWLEDGED, THAT THIS MATERIAL MAY BE SUBJECT TO
+RESTRICTIONS UNDER THE LAWS AND REGULATIONS OF THE UNITED STATES OR OTHER
+COUNTRIES, WHICH INCLUDE BUT ARE NOT LIMITED TO, U.S. EXPORT CONTROL LAWS SUCH
+AS THE EXPORT ADMINISTRATION REGULATIONS AND NATIONAL SECURITY CONTROLS AS
+DEFINED THEREUNDER, AS WELL AS STATE DEPARTMENT CONTROLS UNDER THE U.S.
+MUNITIONS LIST. THIS MATERIAL MAY NOT BE USED, RELEASED, TRANSFERRED, IMPORTED,
+EXPORTED AND/OR RE-EXPORTED IN ANY MANNER PROHIBITED UNDER ANY APPLICABLE LAWS,
+INCLUDING U.S. EXPORT CONTROL LAWS REGARDING SPECIFICALLY DESIGNATED PERSONS,
+COUNTRIES AND NATIONALS OF COUNTRIES SUBJECT TO NATIONAL SECURITY CONTROLS.
+MOREOVER, THE FOREGOING SHALL SURVIVE ANY EXPIRATION OR TERMINATION OF ANY
+LICENSE OR AGREEMENT OR ACCESS OR USE RELATED TO THIS MATERIAL.
+
+NOTICE REGARDING THE U.S. GOVERNMENT AND DOD AGENCIES: This material is
+provided with "RESTRICTED RIGHTS" and/or "LIMITED RIGHTS" as applicable to
+computer software and technical data, respectively. Use, duplication,
+distribution or disclosure by the U.S. Government and/or DOD agencies is
+subject to the full extent of restrictions in all applicable regulations,
+including those found at FAR52.227 and DFARS252.227 et seq. and any successor
+regulations thereof. Use of this material by the U.S. Government and/or DOD
+agencies is acknowledgment of the proprietary rights of any copyright holders
+and contributors, including those of Advanced Micro Devices, Inc., as well as
+the provisions of FAR52.227-14 through 23 regarding privately developed and/or
+commercial computer software.
+
+This license forms the entire agreement regarding the subject matter hereof and
+supersedes all proposals and prior discussions and writings between the parties
+with respect thereto. This license does not affect any ownership, rights, title,
+or interest in, or relating to, this material. No terms of this license can be
+modified or waived, and no breach of this license can be excused, unless done
+so in a writing signed by all affected parties. Each term of this license is
+separately enforceable. If any term of this license is determined to be or
+becomes unenforceable or illegal, such term shall be reformed to the minimum
+extent necessary in order for this license to remain in effect in accordance
+with its terms as modified by such reformation. This license shall be governed
+by and construed in accordance with the laws of the State of Texas without
+regard to rules on conflicts of law of any state or jurisdiction or the United
+Nations Convention on the International Sale of Goods. All disputes arising out
+of this license shall be subject to the jurisdiction of the federal and state
+courts in Austin, Texas, and all defenses are hereby waived concerning personal
+jurisdiction and venue of these courts.
+
+============================================================ */
+
+
+#include "AESEncryptDecrypt.hpp"
+
+using namespace AES;
+
+int AESEncryptDecrypt::setupAESEncryptDecrypt()
+{
+ cl_uint sizeBytes = width*height*sizeof(cl_uchar);
+ input = (cl_uchar*)malloc(sizeBytes);
+ if(input == NULL)
+ {
+ sampleCommon->error("Failed to allocate host memory. (input)");
+ return SDK_FAILURE;
+ }
+
+ /* initialize the input array, do NOTHING but assignment when decrypt*/
+ int decrypt = 0;
+ if(!decrypt)
+ convertColorToGray(pixels, input);
+ else
+ convertGrayToGray(pixels, input);
+
+ /* 1 Byte = 8 bits */
+ keySize = keySizeBits/8;
+
+ /* due to unknown represenation of cl_uchar */
+ keySizeBits = keySize*sizeof(cl_uchar);
+
+ key = (cl_uchar*)malloc(keySizeBits);
+
+ /* random initialization of key */
+ sampleCommon->fillRandom<cl_uchar>(key, keySize, 1, 0, 255, seed);
+
+ /* expand the key */
+ explandedKeySize = (rounds+1)*keySize;
+ expandedKey = (cl_uchar*)malloc(explandedKeySize*sizeof(cl_uchar));
+ roundKey = (cl_uchar*)malloc(explandedKeySize*sizeof(cl_uchar));
+
+ keyExpansion(key, expandedKey, keySize, explandedKeySize);
+ for(cl_uint i=0; i< rounds+1; ++i)
+ {
+ createRoundKey(expandedKey + keySize*i, roundKey + keySize*i);
+ }
+
+ output = (cl_uchar*)malloc(sizeBytes);
+ if(output == NULL)
+ {
+ sampleCommon->error("Failed to allocate host memory. (output)");
+ return SDK_FAILURE;
+ }
+
+ if(!quiet)
+ {
+ if(decrypt)
+ {
+ std::cout << "Decrypting Image ...." << std::endl;
+ }
+ else
+ {
+ std::cout << "Encrypting Image ...." << std::endl;
+ }
+
+ std::cout << "Input Image : " << inFilename << std::endl;
+ std::cout << "Key : ";
+ for(cl_uint i=0; i < keySize; ++i)
+ {
+ std::cout << (cl_uint)key[i] << " ";
+ }
+ std::cout << std::endl;
+ }
+
+ return SDK_SUCCESS;
+}
+
+void
+AESEncryptDecrypt::convertColorToGray(const uchar4 *pixels, cl_uchar *gray)
+{
+ for(cl_int i=0; i< height; ++i)
+ for(cl_int j=0; j<width; ++j)
+ {
+ cl_uint index = i*width + j;
+ // gray = (0.3*R + 0.59*G + 0.11*B)
+ gray[index] = cl_uchar (pixels[index].x * 0.3 +
+ pixels[index].y * 0.59 +
+ pixels[index].z * 0.11 );
+ }
+}
+
+void
+AESEncryptDecrypt::convertGrayToGray(const uchar4 *pixels, cl_uchar *gray)
+{
+ for(cl_int i=0; i< height; ++i)
+ for(cl_int j=0; j<width; ++j)
+ {
+ cl_uint index = i*width + j;
+ gray[index] = pixels[index].x;
+ }
+}
+
+void
+AESEncryptDecrypt::convertGrayToPixels(const cl_uchar *gray, uchar4 *pixels)
+{
+ for(cl_int i=0; i< height; ++i)
+ for(cl_int j=0; j<width; ++j)
+ {
+ cl_uint index = i*width + j;
+ pixels[index].x = gray[index];
+ pixels[index].y = gray[index];
+ pixels[index].z = gray[index];
+ }
+}
+
+int
+AESEncryptDecrypt::genBinaryImage()
+{
+ cl_int status = CL_SUCCESS;
+
+ /*
+ * Have a look at the available platforms and pick either
+ * the AMD one if available or a reasonable default.
+ */
+ cl_uint numPlatforms;
+ cl_platform_id platform = NULL;
+ status = clGetPlatformIDs(0, NULL, &numPlatforms);
+ if(!sampleCommon->checkVal(status,
+ CL_SUCCESS,
+ "clGetPlatformIDs failed."))
+ {
+ return SDK_FAILURE;
+ }
+ if (0 < numPlatforms)
+ {
+ cl_platform_id* platforms = new cl_platform_id[numPlatforms];
+ status = clGetPlatformIDs(numPlatforms, platforms, NULL);
+ if(!sampleCommon->checkVal(status,
+ CL_SUCCESS,
+ "clGetPlatformIDs failed."))
+ {
+ return SDK_FAILURE;
+ }
+
+ char platformName[100];
+ for (unsigned i = 0; i < numPlatforms; ++i)
+ {
+ status = clGetPlatformInfo(platforms[i],
+ CL_PLATFORM_VENDOR,
+ sizeof(platformName),
+ platformName,
+ NULL);
+
+ if(!sampleCommon->checkVal(status,
+ CL_SUCCESS,
+ "clGetPlatformInfo failed."))
+ {
+ return SDK_FAILURE;
+ }
+
+ platform = platforms[i];
+ if (!strcmp(platformName, "Advanced Micro Devices, Inc."))
+ {
+ break;
+ }
+ }
+ std::cout << "Platform found : " << platformName << "\n";
+ delete[] platforms;
+ }
+
+ if(NULL == platform)
+ {
+ sampleCommon->error("NULL platform found so Exiting Application.");
+ return SDK_FAILURE;
+ }
+
+ /*
+ * If we could find our platform, use it. Otherwise use just available platform.
+ */
+ cl_context_properties cps[5] =
+ {
+ CL_CONTEXT_PLATFORM,
+ (cl_context_properties)platform,
+ CL_CONTEXT_OFFLINE_DEVICES_AMD,
+ (cl_context_properties)1,
+ 0
+ };
+
+ context = clCreateContextFromtype(cps,
+ CL_DEVICE_TYPE_ALL,
+ NULL,
+ NULL,
+ &status);
+
+ if(!sampleCommon->checkVal(status,
+ CL_SUCCESS,
+ "clCreateContextFromtype failed."))
+ {
+ return SDK_FAILURE;
+ }
+
+ /* create a CL program using the kernel source */
+ streamsdk::SDKFile kernelFile;
+ std::string kernelPath = sampleCommon->getPath();
+ kernelPath.append("AESEncryptDecrypt_Kernels.cl");
+ if(!kernelFile.open(kernelPath.c_str()))
+ {
+ std::cout << "Failed to load kernel file : " << kernelPath << std::endl;
+ return SDK_FAILURE;
+ }
+ const char * source = kernelFile.source().c_str();
+ size_t sourceSize[] = {strlen(source)};
+ program = clCreateProgramWithSource(context,
+ 1,
+ &source,
+ sourceSize,
+ &status);
+ if(!sampleCommon->checkVal(status,
+ CL_SUCCESS,
+ "clCreateProgramWithSource failed."))
+ {
+ return SDK_FAILURE;
+ }
+
+ /* create a cl program executable for all the devices specified */
+ status = clBuildProgram(program,
+ 0,
+ NULL,
+ NULL,
+ NULL,
+ NULL);
+
+ size_t numDevices;
+ status = clGetProgramInfo(program,
+ CL_PROGRAM_NUM_DEVICES,
+ sizeof(numDevices),
+ &numDevices,
+ NULL );
+ if(!sampleCommon->checkVal(status,
+ CL_SUCCESS,
+ "clGetProgramInfo(CL_PROGRAM_NUM_DEVICES) failed."))
+ {
+ return SDK_FAILURE;
+ }
+
+ std::cout << "Number of devices found : " << numDevices << "\n\n";
+ devices = (cl_device_id *)malloc( sizeof(cl_device_id) * numDevices );
+ if(devices == NULL)
+ {
+ sampleCommon->error("Failed to allocate host memory.(devices)");
+ return SDK_FAILURE;
+ }
+ /* grab the handles to all of the devices in the program. */
+ status = clGetProgramInfo(program,
+ CL_PROGRAM_DEVICES,
+ sizeof(cl_device_id) * numDevices,
+ devices,
+ NULL );
+ if(!sampleCommon->checkVal(status,
+ CL_SUCCESS,
+ "clGetProgramInfo(CL_PROGRAM_DEVICES) failed."))
+ {
+ return SDK_FAILURE;
+ }
+
+
+ /* figure out the sizes of each of the binaries. */
+ size_t *binarySizes = (size_t*)malloc( sizeof(size_t) * numDevices );
+ if(devices == NULL)
+ {
+ sampleCommon->error("Failed to allocate host memory.(binarySizes)");
+ return SDK_FAILURE;
+ }
+
+ status = clGetProgramInfo(program,
+ CL_PROGRAM_BINARY_SIZES,
+ sizeof(size_t) * numDevices,
+ binarySizes, NULL);
+ if(!sampleCommon->checkVal(status,
+ CL_SUCCESS,
+ "clGetProgramInfo(CL_PROGRAM_BINARY_SIZES) failed."))
+ {
+ return SDK_FAILURE;
+ }
+
+ size_t i = 0;
+ /* copy over all of the generated binaries. */
+ char **binaries = (char **)malloc( sizeof(char *) * numDevices );
+ if(binaries == NULL)
+ {
+ sampleCommon->error("Failed to allocate host memory.(binaries)");
+ return SDK_FAILURE;
+ }
+
+ for(i = 0; i < numDevices; i++)
+ {
+ if(binarySizes[i] != 0)
+ {
+ binaries[i] = (char *)malloc( sizeof(char) * binarySizes[i]);
+ if(binaries[i] == NULL)
+ {
+ sampleCommon->error("Failed to allocate host memory.(binaries[i])");
+ return SDK_FAILURE;
+ }
+ }
+ else
+ {
+ binaries[i] = NULL;
+ }
+ }
+ status = clGetProgramInfo(program,
+ CL_PROGRAM_BINARIES,
+ sizeof(char *) * numDevices,
+ binaries,
+ NULL);
+ if(!sampleCommon->checkVal(status,
+ CL_SUCCESS,
+ "clGetProgramInfo(CL_PROGRAM_BINARIES) failed."))
+ {
+ return SDK_FAILURE;
+ }
+
+ /* dump out each binary into its own separate file. */
+ for(i = 0; i < numDevices; i++)
+ {
+ char fileName[100];
+ sprintf(fileName, "%s.%d", dumpBinary.c_str(), (int)i);
+ if(binarySizes[i] != 0)
+ {
+ char deviceName[1024];
+ status = clGetDeviceInfo(devices[i],
+ CL_DEVICE_NAME,
+ sizeof(deviceName),
+ deviceName,
+ NULL);
+ if(!sampleCommon->checkVal(status,
+ CL_SUCCESS,
+ "clGetDeviceInfo(CL_DEVICE_NAME) failed."))
+ {
+ return SDK_FAILURE;
+ }
+
+ printf( "%s binary kernel: %s\n", deviceName, fileName);
+ streamsdk::SDKFile BinaryFile;
+ if(!BinaryFile.writeBinaryToFile(fileName,
+ binaries[i],
+ binarySizes[i]))
+ {
+ std::cout << "Failed to load kernel file : " << fileName << std::endl;
+ return SDK_FAILURE;
+ }
+ }
+ else
+ {
+ printf("Skipping %s since there is no binary data to write!\n",
+ fileName);
+ }
+ }
+
+ // Release all resouces and memory
+ for(i = 0; i < numDevices; i++)
+ {
+ if(binaries[i] != NULL)
+ {
+ free(binaries[i]);
+ binaries[i] = NULL;
+ }
+ }
+
+ if(binaries != NULL)
+ {
+ free(binaries);
+ binaries = NULL;
+ }
+
+ if(binarySizes != NULL)
+ {
+ free(binarySizes);
+ binarySizes = NULL;
+ }
+
+ if(devices != NULL)
+ {
+ free(devices);
+ devices = NULL;
+ }
+
+ status = clReleaseProgram(program);
+ if(!sampleCommon->checkVal(status,
+ CL_SUCCESS,
+ "clReleaseProgram failed."))
+ {
+ return SDK_FAILURE;
+ }
+
+ status = clReleaseContext(context);
+ if(!sampleCommon->checkVal(status,
+ CL_SUCCESS,
+ "clReleaseContext failed."))
+ {
+ return SDK_FAILURE;
+ }
+
+ return SDK_SUCCESS;
+}
+
+
+int
+AESEncryptDecrypt::setupCL(void)
+{
+ cl_int status = 0;
+ size_t deviceListSize;
+
+ cl_device_type dtype;
+
+ if(devicetype.compare("cpu") == 0)
+ {
+ dtype = CL_DEVICE_TYPE_CPU;
+ }
+ else //devicetype = "gpu"
+ {
+ dtype = CL_DEVICE_TYPE_GPU;
+ }
+
+ /*
+ * Have a look at the available platforms and pick either
+ * the AMD one if available or a reasonable default.
+ */
+
+ cl_uint numPlatforms;
+ cl_platform_id platform = NULL;
+ status = clGetPlatformIDs(0, NULL, &numPlatforms);
+ if(!sampleCommon->checkVal(status,
+ CL_SUCCESS,
+ "clGetPlatformIDs failed."))
+ {
+ return SDK_FAILURE;
+ }
+ if (0 < numPlatforms)
+ {
+ cl_platform_id* platforms = new cl_platform_id[numPlatforms];
+ status = clGetPlatformIDs(numPlatforms, platforms, NULL);
+ if(!sampleCommon->checkVal(status,
+ CL_SUCCESS,
+ "clGetPlatformIDs failed."))
+ {
+ return SDK_FAILURE;
+ }
+ for (unsigned i = 0; i < numPlatforms; ++i)
+ {
+ char pbuf[100];
+ status = clGetPlatformInfo(platforms[i],
+ CL_PLATFORM_VENDOR,
+ sizeof(pbuf),
+ pbuf,
+ NULL);
+
+ if(!sampleCommon->checkVal(status,
+ CL_SUCCESS,
+ "clGetPlatformInfo failed."))
+ {
+ return SDK_FAILURE;
+ }
+
+ platform = platforms[i];
+ if (!strcmp(pbuf, "Advanced Micro Devices, Inc."))
+ {
+ break;
+ }
+ }
+ delete[] platforms;
+ }
+
+ if(NULL == platform)
+ {
+ sampleCommon->error("NULL platform found so Exiting Application.");
+ return SDK_FAILURE;
+ }
+
+ // Display available devices.
+ if(!sampleCommon->displayDevices(platform, dtype))
+ {
+ sampleCommon->error("sampleCommon::displayDevices() failed");
+ return SDK_FAILURE;
+ }
+
+ /*
+ * If we could find our platform, use it. Otherwise use just available platform.
+ */
+ cl_context_properties cps[3] =
+ {
+ CL_CONTEXT_PLATFORM,
+ (cl_context_properties)platform,
+ 0
+ };
+
+ context = clCreateContextFromtype(
+ cps,
+ dtype,
+ NULL,
+ NULL,
+ &status);
+
+ if(!sampleCommon->checkVal(status,
+ CL_SUCCESS,
+ "clCreateContextFromtype failed."))
+ return SDK_FAILURE;
+
+ /* First, get the size of device list data */
+ status = clGetContextInfo(
+ context,
+ CL_CONTEXT_DEVICES,
+ 0,
+ NULL,
+ &deviceListSize);
+ if(!sampleCommon->checkVal(
+ status,
+ CL_SUCCESS,
+ "clGetContextInfo failed."))
+ return SDK_FAILURE;
+
+ int devicecount = (int)(deviceListSize / sizeof(cl_device_id));
+ if(!sampleCommon->validateDeviceId(deviceId, devicecount))
+ {
+ sampleCommon->error("sampleCommon::validateDeviceId() failed");
+ return SDK_FAILURE;
+ }
+
+ /* Now allocate memory for device list based on the size we got earlier */
+ devices = (cl_device_id *)malloc(deviceListSize);
+ if(devices == NULL)
+ {
+ sampleCommon->error("Failed to allocate memory (devices).");
+ return SDK_FAILURE;
+ }
+
+ /* Now, get the device list data */
+ status = clGetContextInfo(
+ context,
+ CL_CONTEXT_DEVICES,
+ deviceListSize,
+ devices,
+ NULL);
+ if(!sampleCommon->checkVal(
+ status,
+ CL_SUCCESS,
+ "clGetGetContextInfo failed."))
+ return SDK_FAILURE;
+
+ /* Get Device specific Information */
+ status = clGetDeviceInfo(
+ devices[deviceId],
+ CL_DEVICE_MAX_WORK_GROUP_SIZE,
+ sizeof(size_t),
+ (void *)&maxWorkGroupSize,
+ NULL);
+
+ if(!sampleCommon->checkVal(
+ status,
+ CL_SUCCESS,
+ "clGetDeviceInfo CL_DEVICE_MAX_WORK_GROUP_SIZE failed."))
+ return SDK_FAILURE;
+
+
+ status = clGetDeviceInfo(
+ devices[deviceId],
+ CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS,
+ sizeof(cl_uint),
+ (void *)&maxDimensions,
+ NULL);
+
+ if(!sampleCommon->checkVal(
+ status,
+ CL_SUCCESS,
+ "clGetDeviceInfo CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS failed."))
+ return SDK_FAILURE;
+
+
+ maxWorkItemSizes = (size_t *)malloc(maxDimensions*sizeof(size_t));
+
+ status = clGetDeviceInfo(
+ devices[deviceId],
+ CL_DEVICE_MAX_WORK_ITEM_SIZES,
+ sizeof(size_t)*maxDimensions,
+ (void *)maxWorkItemSizes,
+ NULL);
+
+ if(!sampleCommon->checkVal(
+ status,
+ CL_SUCCESS,
+ "clGetDeviceInfo CL_DEVICE_MAX_WORK_ITEM_SIZES failed."))
+ return SDK_FAILURE;
+
+
+ status = clGetDeviceInfo(
+ devices[deviceId],
+ CL_DEVICE_LOCAL_MEM_SIZE,
+ sizeof(cl_ulong),
+ (void *)&totalLocalMemory,
+ NULL);
+
+ if(!sampleCommon->checkVal(
+ status,
+ CL_SUCCESS,
+ "clGetDeviceInfo CL_DEVICE_LOCAL_MEM_SIZES failed."))
+ return SDK_FAILURE;
+
+
+ {
+ /* The block is to move the declaration of prop closer to its use */
+ cl_command_queue_properties prop = 0;
+ if(timing)
+ prop |= CL_QUEUE_PROFILING_ENABLE;
+
+ commandQueue = clCreateCommandQueue(
+ context,
+ devices[deviceId],
+ prop,
+ &status);
+ if(!sampleCommon->checkVal(
+ status,
+ 0,
+ "clCreateCommandQueue failed."))
+ return SDK_FAILURE;
+ }
+ inputBuffer = clCreateBuffer(
+ context,
+ CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
+ sizeof(cl_uchar ) * width * height,
+ input,
+ &status);
+ if(!sampleCommon->checkVal(
+ status,
+ CL_SUCCESS,
+ "clCreateBuffer failed. (inputBuffer)"))
+ return SDK_FAILURE;
+
+ outputBuffer = clCreateBuffer(
+ context,
+ CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR,
+ sizeof(cl_uchar ) * width * height,
+ output,
+ &status);
+
+ if(!sampleCommon->checkVal(
+ status,
+ CL_SUCCESS,
+ "clCreateBuffer failed. (outputBuffer)"))
+ return SDK_FAILURE;
+
+ rKeyBuffer = clCreateBuffer(
+ context,
+ CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
+ sizeof(cl_uchar ) * explandedKeySize,
+ roundKey,
+ &status);
+
+ if(!sampleCommon->checkVal(
+ status,
+ CL_SUCCESS,
+ "clCreateBuffer failed. (rKeyBuffer)"))
+ return SDK_FAILURE;
+
+ cl_uchar * sBox;
+ sBox = (cl_uchar *)sbox;
+ sBoxBuffer = clCreateBuffer(
+ context,
+ CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
+ sizeof(cl_uchar ) * 256,
+ sBox,
+ &status);
+
+ cl_uchar * rsBox;
+ rsBox = (cl_uchar *)rsbox;
+ rsBoxBuffer = clCreateBuffer(
+ context,
+ CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
+ sizeof(cl_uchar ) * 256,
+ rsBox,
+ &status);
+
+ if(!sampleCommon->checkVal(
+ status,
+ CL_SUCCESS,
+ "clCreateBuffer failed. (sBoxBuffer)"))
+ return SDK_FAILURE;
+
+ /* create a CL program using the kernel source */
+ streamsdk::SDKFile kernelFile;
+ std::string kernelPath = sampleCommon->getPath();
+
+ if(isLoadBinaryEnabled())
+ {
+ kernelPath.append(loadBinary.c_str());
+ if(!kernelFile.readBinaryFromFile(kernelPath.c_str()))
+ {
+ std::cout << "Failed to load kernel file : " << kernelPath << std::endl;
+ return SDK_FAILURE;
+ }
+
+ const char * binary = kernelFile.source().c_str();
+ size_t binarySize = kernelFile.source().size();
+ program = clCreateProgramWithBinary(context,
+ 1,
+ &devices[deviceId],
+ (const size_t *)&binarySize,
+ (const unsigned char**)&binary,
+ NULL,
+ &status);
+ if(!sampleCommon->checkVal(status,
+ CL_SUCCESS,
+ "clCreateProgramWithBinary failed."))
+ {
+ return SDK_FAILURE;
+ }
+
+ }
+ else
+ {
+ kernelPath.append("AESEncryptDecrypt_Kernels.cl");
+ if(!kernelFile.open(kernelPath.c_str()))
+ {
+ std::cout << "Failed to load kernel file: " << kernelPath << std::endl;
+ return SDK_FAILURE;
+ }
+ const char * source = kernelFile.source().c_str();
+ size_t sourceSize[] = { strlen(source) };
+ program = clCreateProgramWithSource(
+ context,
+ 1,
+ &source,
+ sourceSize,
+ &status);
+ if(!sampleCommon->checkVal(
+ status,
+ CL_SUCCESS,
+ "clCreateProgramWithSource failed."))
+ return SDK_FAILURE;
+ }
+
+ /* create a cl program executable for all the devices specified */
+ status = clBuildProgram(program, 1, &devices[deviceId], NULL, NULL, NULL);
+ if(status != CL_SUCCESS)
+ {
+ if(status == CL_BUILD_PROGRAM_FAILURE)
+ {
+ cl_int logStatus;
+ char * buildLog = NULL;
+ size_t buildLogSize = 0;
+ logStatus = clGetProgramBuildInfo(program,
+ devices[deviceId],
+ CL_PROGRAM_BUILD_LOG,
+ buildLogSize,
+ buildLog,
+ &buildLogSize);
+ if(!sampleCommon->checkVal(logStatus,
+ CL_SUCCESS,
+ "clGetProgramBuildInfo failed."))
+ {
+ return SDK_FAILURE;
+ }
+
+ buildLog = (char*)malloc(buildLogSize);
+ if(buildLog == NULL)
+ {
+ sampleCommon->error("Failed to allocate host memory. (buildLog)");
+ return SDK_FAILURE;
+ }
+ memset(buildLog, 0, buildLogSize);
+
+ logStatus = clGetProgramBuildInfo(program,
+ devices[deviceId],
+ CL_PROGRAM_BUILD_LOG,
+ buildLogSize,
+ buildLog,
+ NULL);
+ if(!sampleCommon->checkVal(logStatus,
+ CL_SUCCESS,
+ "clGetProgramBuildInfo failed."))
+ {
+ free(buildLog);
+ return SDK_FAILURE;
+ }
+
+ std::cout << " \n\t\t\tBUILD LOG\n";
+ std::cout << " ************************************************\n";
+ std::cout << buildLog << std::endl;
+ std::cout << " ************************************************\n";
+ free(buildLog);
+ }
+
+ if(!sampleCommon->checkVal(status,
+ CL_SUCCESS,
+ "clBuildProgram failed."))
+ {
+ return SDK_FAILURE;
+ }
+ }
+
+ /* get a kernel object handle for a kernel with the given name */
+ if(decrypt)
+ {
+ kernel = clCreateKernel(program, "AESDecrypt", &status);
+ }
+ else
+ {
+ kernel = clCreateKernel(program, "AESEncrypt", &status);
+ }
+
+ if(!sampleCommon->checkVal(
+ status,
+ CL_SUCCESS,
+ "clCreateKernel failed."))
+ return SDK_FAILURE;
+
+ return SDK_SUCCESS;
+}
+
+
+int
+AESEncryptDecrypt::runCLKernels(void)
+{
+ cl_int status;
+ cl_event events[2];
+
+ size_t globalThreads[2]= {width/4, height};
+ size_t localThreads[2] = {1, 4};
+
+ status = clGetKernelWorkGroupInfo(
+ kernel,
+ devices[deviceId],
+ CL_KERNEL_LOCAL_MEM_SIZE,
+ sizeof(cl_ulong),
+ &usedLocalMemory,
+ NULL);
+ if(!sampleCommon->checkVal(
+ status,
+ CL_SUCCESS,
+ "clGetKernelWorkGroupInfo failed.(usedLocalMemory)"))
+ return SDK_FAILURE;
+
+ availableLocalMemory = totalLocalMemory - usedLocalMemory;
+
+ /* two local memories buffers of sizeof(cl_uchar)*keySize */
+ neededLocalMemory = 2*sizeof(cl_uchar)*keySize;
+
+ if(neededLocalMemory > availableLocalMemory)
+ {
+ std::cout << "Unsupported: Insufficient local memory on device." << std::endl;
+ return SDK_SUCCESS;
+ }
+
+ if(localThreads[0] > maxWorkItemSizes[0] ||
+ localThreads[1] > maxWorkItemSizes[1] ||
+ localThreads[0]*localThreads[1] > maxWorkGroupSize)
+ {
+ std::cout << "Unsupported: Device does not support requested number of work items."<<std::endl;
+ return SDK_SUCCESS;
+ }
+
+ /* Check group size against kernelWorkGroupSize */
+ status = clGetKernelWorkGroupInfo(kernel,
+ devices[deviceId],
+ CL_KERNEL_WORK_GROUP_SIZE,
+ sizeof(size_t),
+ &kernelWorkGroupSize,
+ 0);
+ if(!sampleCommon->checkVal(
+ status,
+ CL_SUCCESS,
+ "clGetKernelWorkGroupInfo failed."))
+ {
+ return SDK_FAILURE;
+ }
+
+ if((cl_uint)(localThreads[0]*localThreads[1]) > kernelWorkGroupSize )
+ {
+ std::cout << "Out of Resources!" << std::endl;
+ std::cout << "Group Size specified : " << localThreads[0] * localThreads[1] << std::endl;
+ std::cout << "Max Group Size supported on the kernel : "
+ << kernelWorkGroupSize << std::endl;
+ return SDK_FAILURE;
+ }
+
+ /*** Set appropriate arguments to the kernel ***/
+ status = clSetKernelArg(
+ kernel,
+ 0,
+ sizeof(cl_mem),
+ (void *)&outputBuffer);
+ if(!sampleCommon->checkVal(
+ status,
+ CL_SUCCESS,
+ "clSetKernelArg failed. (outputBuffer)"))
+ return SDK_FAILURE;
+
+ status = clSetKernelArg(
+ kernel,
+ 1,
+ sizeof(cl_mem),
+ (void *)&inputBuffer);
+ if(!sampleCommon->checkVal(
+ status,
+ CL_SUCCESS,
+ "clSetKernelArg failed. (inputBuffer)"))
+ return SDK_FAILURE;
+
+ status = clSetKernelArg(
+ kernel,
+ 2,
+ sizeof(cl_mem),
+ (void *)&rKeyBuffer);
+ if(!sampleCommon->checkVal(
+ status,
+ CL_SUCCESS,
+ "clSetKernelArg failed. (rKeyBuffer)"))
+ return SDK_FAILURE;
+
+ if(decrypt)
+ {
+ status = clSetKernelArg(
+ kernel,
+ 3,
+ sizeof(cl_mem),
+ (void *)&rsBoxBuffer);
+ }
+ else
+ {
+ status = clSetKernelArg(
+ kernel,
+ 3,
+ sizeof(cl_mem),
+ (void *)&sBoxBuffer);
+ }
+ if(!sampleCommon->checkVal(
+ status,
+ CL_SUCCESS,
+ "clSetKernelArg failed. (SBoxBuffer)"))
+ return SDK_FAILURE;
+
+ status = clSetKernelArg(
+ kernel,
+ 4,
+ sizeof(cl_uchar)*keySize,
+ NULL);
+ if(!sampleCommon->checkVal(
+ status,
+ CL_SUCCESS,
+ "clSetKernelArg failed. (block0)"))
+ return SDK_FAILURE;
+
+ status = clSetKernelArg(
+ kernel,
+ 5,
+ sizeof(cl_uchar)*keySize,
+ NULL);
+ if(!sampleCommon->checkVal(
+ status,
+ CL_SUCCESS,
+ "clSetKernelArg failed. (block1)"))
+ return SDK_FAILURE;
+
+ status = clSetKernelArg(
+ kernel,
+ 6,
+ sizeof(cl_uint),
+ (void *)&width);
+ if(!sampleCommon->checkVal(
+ status,
+ CL_SUCCESS,
+ "clSetKernelArg failed. (width)"))
+ return SDK_FAILURE;
+
+ status = clSetKernelArg(
+ kernel,
+ 7,
+ sizeof(cl_uint),
+ (void *)&rounds);
+ if(!sampleCommon->checkVal(
+ status,
+ CL_SUCCESS,
+ "clSetKernelArg failed. (rounds)"))
+ return SDK_FAILURE;
+
+
+ /*
+ * Enqueue a kernel run call.
+ */
+ status = clEnqueueNDRangeKernel(
+ commandQueue,
+ kernel,
+ 2,
+ NULL,
+ globalThreads,
+ localThreads,
+ 0,
+ NULL,
+ &events[0]);
+
+ if(!sampleCommon->checkVal(
+ status,
+ CL_SUCCESS,
+ "clEnqueueNDRangeKernel failed."))
+ return SDK_FAILURE;
+
+
+ /* wait for the kernel call to finish execution */
+ status = clWaitForEvents(1, &events[0]);
+ if(!sampleCommon->checkVal(
+ status,
+ CL_SUCCESS,
+ "clWaitForEvents failed."))
+ return SDK_FAILURE;
+
+ /* Enqueue the results to application pointer*/
+ status = clEnqueueReadBuffer(
+ commandQueue,
+ outputBuffer,
+ CL_TRUE,
+ 0,
+ width * height * sizeof(cl_uchar),
+ output,
+ 0,
+ NULL,
+ &events[1]);
+ if(!sampleCommon->checkVal(
+ status,
+ CL_SUCCESS,
+ "clEnqueueReadBuffer failed."))
+ return SDK_FAILURE;
+
+ /* Wait for the read buffer to finish execution */
+ status = clWaitForEvents(1, &events[1]);
+ if(!sampleCommon->checkVal(
+ status,
+ CL_SUCCESS,
+ "clWaitForEvents failed."))
+ return SDK_FAILURE;
+
+ clReleaseEvent(events[0]);
+ clReleaseEvent(events[1]);
+
+ return SDK_SUCCESS;
+}
+
+cl_uchar
+AESEncryptDecrypt::getRconValue(cl_uint num)
+{
+ return Rcon[num];
+}
+
+void
+AESEncryptDecrypt::rotate(cl_uchar * word)
+{
+ cl_uchar c = word[0];
+ for(cl_uint i=0; i<3; ++i)
+ {
+ word[i] = word[i+1];
+ }
+ word[3] = c;
+}
+
+void
+AESEncryptDecrypt::core(cl_uchar * word, cl_uint iter)
+{
+ rotate(word);
+
+ for(cl_uint i=0; i < 4; ++i)
+ {
+ word[i] = getSBoxValue(word[i]);
+ }
+
+ word[0] = word[0]^getRconValue(iter);
+}
+
+void
+AESEncryptDecrypt::keyExpansion(cl_uchar * key, cl_uchar * expandedKey,
+ cl_uint keySize, cl_uint explandedKeySize)
+{
+ cl_uint currentSize = 0;
+ cl_uint rConIteration = 1;
+ cl_uchar temp[4] = {0};
+
+ for(cl_uint i=0; i < keySize; ++i)
+ {
+ expandedKey[i] = key[i];
+ }
+
+ currentSize += keySize;
+
+ while(currentSize < explandedKeySize)
+ {
+ for(cl_uint i=0; i < 4; ++i)
+ {
+ temp[i] = expandedKey[(currentSize - 4) + i];
+ }
+
+ if(currentSize%keySize == 0)
+ {
+ core(temp, rConIteration++);
+ }
+
+ //XXX: add extra SBOX here if the keySize is 32 Bytes
+
+ for(cl_uint i=0; i < 4; ++i)
+ {
+ expandedKey[currentSize] = expandedKey[currentSize - keySize]^temp[i];
+ currentSize++;
+ }
+ }
+}
+
+cl_uchar
+AESEncryptDecrypt::getSBoxValue(cl_uint num)
+{
+ return sbox[num];
+}
+
+cl_uchar
+AESEncryptDecrypt::getSBoxInvert(cl_uint num)
+{
+ return rsbox[num];
+}
+
+cl_uchar
+AESEncryptDecrypt::galoisMultiplication(cl_uchar a, cl_uchar b)
+{
+ cl_uchar p = 0;
+ for(cl_uint i=0; i < 8; ++i)
+ {
+ if((b&1) == 1)
+ {
+ p^=a;
+ }
+ cl_uchar hiBitSet = (a & 0x80);
+ a <<= 1;
+ if(hiBitSet == 0x80)
+ {
+ a ^= 0x1b;
+ }
+ b >>= 1;
+ }
+ return p;
+}
+
+void
+AESEncryptDecrypt::mixColumn(cl_uchar *column)
+{
+ cl_uchar cpy[4];
+ for(cl_uint i=0; i < 4; ++i)
+ {
+ cpy[i] = column[i];
+ }
+ column[0] = galoisMultiplication(cpy[0], 2)^
+ galoisMultiplication(cpy[3], 1)^
+ galoisMultiplication(cpy[2], 1)^
+ galoisMultiplication(cpy[1], 3);
+
+ column[1] = galoisMultiplication(cpy[1], 2)^
+ galoisMultiplication(cpy[0], 1)^
+ galoisMultiplication(cpy[3], 1)^
+ galoisMultiplication(cpy[2], 3);
+
+ column[2] = galoisMultiplication(cpy[2], 2)^
+ galoisMultiplication(cpy[1], 1)^
+ galoisMultiplication(cpy[0], 1)^
+ galoisMultiplication(cpy[3], 3);
+
+ column[3] = galoisMultiplication(cpy[3], 2)^
+ galoisMultiplication(cpy[2], 1)^
+ galoisMultiplication(cpy[1], 1)^
+ galoisMultiplication(cpy[0], 3);
+}
+
+void
+AESEncryptDecrypt::mixColumnInv(cl_uchar *column)
+{
+ cl_uchar cpy[4];
+ for(cl_uint i=0; i < 4; ++i)
+ {
+ cpy[i] = column[i];
+ }
+ column[0] = galoisMultiplication(cpy[0], 14 )^
+ galoisMultiplication(cpy[3], 9 )^
+ galoisMultiplication(cpy[2], 13)^
+ galoisMultiplication(cpy[1], 11);
+
+ column[1] = galoisMultiplication(cpy[1], 14 )^
+ galoisMultiplication(cpy[0], 9 )^
+ galoisMultiplication(cpy[3], 13)^
+ galoisMultiplication(cpy[2], 11);
+
+ column[2] = galoisMultiplication(cpy[2], 14 )^
+ galoisMultiplication(cpy[1], 9 )^
+ galoisMultiplication(cpy[0], 13)^
+ galoisMultiplication(cpy[3], 11);
+
+ column[3] = galoisMultiplication(cpy[3], 14 )^
+ galoisMultiplication(cpy[2], 9 )^
+ galoisMultiplication(cpy[1], 13)^
+ galoisMultiplication(cpy[0], 11);
+}
+
+void
+AESEncryptDecrypt::mixColumns(cl_uchar * state, cl_bool inverse)
+{
+ cl_uchar column[4];
+ for(cl_uint i=0; i < 4; ++i)
+ {
+ for(cl_uint j=0; j < 4; ++j)
+ {
+ column[j] = state[j*4 + i];
+ }
+
+ if(inverse)
+ {
+ mixColumnInv(column);
+ }
+ else
+ {
+ mixColumn(column);
+ }
+
+ for(cl_uint j=0; j < 4; ++j)
+ {
+ state[j*4 + i] = column[j];
+ }
+ }
+}
+
+void
+AESEncryptDecrypt::subBytes(cl_uchar * state, cl_bool inverse)
+{
+ for(cl_uint i=0; i < keySize; ++i)
+ {
+ state[i] = inverse ? getSBoxInvert(state[i]): getSBoxValue(state[i]);
+ }
+}
+
+void
+AESEncryptDecrypt::shiftRow(cl_uchar *state, cl_uchar nbr)
+{
+ for(cl_uint i=0; i < nbr; ++i)
+ {
+ cl_uchar tmp = state[0];
+ for(cl_uint j = 0; j < 3; ++j)
+ {
+ state[j] = state[j+1];
+ }
+ state[3] = tmp;
+ }
+}
+
+void
+AESEncryptDecrypt::shiftRowInv(cl_uchar *state, cl_uchar nbr)
+{
+ for(cl_uint i=0; i < nbr; ++i)
+ {
+ cl_uchar tmp = state[3];
+ for(cl_uint j = 3; j > 0; --j)
+ {
+ state[j] = state[j-1];
+ }
+ state[0] = tmp;
+ }
+}
+void
+AESEncryptDecrypt::shiftRows(cl_uchar * state, cl_bool inverse)
+{
+ for(cl_uint i=0; i < 4; ++i)
+ {
+ if(inverse)
+ shiftRowInv(state + i*4, i);
+ else
+ shiftRow(state + i*4, i);
+ }
+}
+
+void
+AESEncryptDecrypt::addRoundKey(cl_uchar * state, cl_uchar * rKey)
+{
+ for(cl_uint i=0; i < keySize; ++i)
+ {
+ state[i] = state[i] ^ rKey[i];
+ }
+}
+
+void
+AESEncryptDecrypt::createRoundKey(cl_uchar * eKey, cl_uchar * rKey)
+{
+ for(cl_uint i=0; i < 4; ++i)
+ for(cl_uint j=0; j < 4; ++j)
+ {
+ rKey[i+ j*4] = eKey[i*4 + j];
+ }
+}
+
+void
+AESEncryptDecrypt::aesRound(cl_uchar * state, cl_uchar * rKey)
+{
+ subBytes(state, decrypt);
+ shiftRows(state, decrypt);
+ mixColumns(state, decrypt);
+ addRoundKey(state, rKey);
+}
+
+void
+AESEncryptDecrypt::aesMain(cl_uchar * state, cl_uchar * rKey, cl_uint rounds)
+{
+ addRoundKey(state, rKey);
+ for(cl_uint i=1; i < rounds; ++i)
+ {
+ aesRound(state, rKey + keySize*i);
+ }
+ subBytes(state, decrypt);
+ shiftRows(state, decrypt);
+ addRoundKey(state, rKey + keySize*rounds);
+}
+
+void
+AESEncryptDecrypt::aesRoundInv(cl_uchar * state, cl_uchar * rKey)
+{
+ shiftRows(state, decrypt);
+ subBytes(state, decrypt);
+ addRoundKey(state, rKey);
+ mixColumns(state, decrypt);
+}
+
+void
+AESEncryptDecrypt::aesMainInv(cl_uchar * state, cl_uchar * rKey, cl_uint rounds)
+{
+ addRoundKey(state, rKey + keySize*rounds);
+ for(cl_uint i=rounds-1; i > 0; --i)
+ {
+ aesRoundInv(state, rKey + keySize*i);
+ }
+ shiftRows(state, decrypt);
+ subBytes(state, decrypt);
+ addRoundKey(state, rKey);
+}
+
+/**
+ *
+ *
+ */
+void
+AESEncryptDecrypt::AESEncryptDecryptCPUReference(cl_uchar * output ,
+ cl_uchar * input ,
+ cl_uchar * rKey ,
+ cl_uint explandedKeySize,
+ cl_uint width ,
+ cl_uint height ,
+ cl_bool inverse )
+{
+ cl_uchar block[16];
+
+ for(cl_uint blocky = 0; blocky < height/4; ++blocky)
+ for(cl_uint blockx= 0; blockx < width/4; ++blockx)
+ {
+ for(cl_uint i=0; i < 4; ++i)
+ {
+ for(cl_uint j=0; j < 4; ++j)
+ {
+ cl_uint index = (((blocky * width/4) + blockx) * keySize )+ (i*4 + j);
+ block[i*4 + j] = input[index];
+ }
+ }
+
+ if(inverse)
+ aesMainInv(block, rKey, rounds);
+ else
+ aesMain(block, rKey, rounds);
+
+ for(cl_uint i=0; i <4 ; ++i)
+ {
+ for(cl_uint j=0; j <4; ++j)
+ {
+ cl_uint index = (((blocky * width/4) + blockx) * keySize )+ (i*4 + j);
+ output[index] = block[i*4 + j];
+ }
+ }
+ }
+}
+
+
+int
+AESEncryptDecrypt::initialize()
+{
+ // Call base class Initialize to get default configuration
+ if(!this->SDKSample::initialize())
+ return SDK_FAILURE;
+
+ streamsdk::Option* ifilename_opt = new streamsdk::Option;
+ if(!ifilename_opt)
+ {
+ sampleCommon->error("Memory allocation error.\n");
+ return SDK_FAILURE;
+ }
+ ifilename_opt->_sVersion = "x";
+ ifilename_opt->_lVersion = "input";
+ ifilename_opt->_description = "Image as Input";
+ ifilename_opt->_type = streamsdk::CA_ARG_STRING;
+ ifilename_opt->_value = &inFilename;
+ sampleArgs->AddOption(ifilename_opt);
+
+ delete ifilename_opt;
+
+ ////////////////
+ streamsdk::Option* ofilename_opt = new streamsdk::Option;
+ if(!ofilename_opt)
+ {
+ sampleCommon->error("Memory allocation error.\n");
+ return SDK_FAILURE;
+ }
+ ofilename_opt->_sVersion = "y";
+ ofilename_opt->_lVersion = "output";
+ ofilename_opt->_description = "Image as Ouput";
+ ofilename_opt->_type = streamsdk::CA_ARG_STRING;
+ ofilename_opt->_value = &outFilename;
+ sampleArgs->AddOption(ofilename_opt);
+
+ delete ofilename_opt;
+
+ ////////////////
+ streamsdk::Option* decrypt_opt = new streamsdk::Option;
+ if(!decrypt_opt)
+ {
+ sampleCommon->error("Memory allocation error.\n");
+ return SDK_FAILURE;
+ }
+ decrypt_opt->_sVersion = "z";
+ decrypt_opt->_lVersion = "decrypt";
+ decrypt_opt->_description = "Decrypt the Input Image";
+ decrypt_opt->_type = streamsdk::CA_NO_ARGUMENT;
+ decrypt_opt->_value = &decrypt;
+ sampleArgs->AddOption(decrypt_opt);
+
+ delete decrypt_opt;
+
+ streamsdk::Option* num_iterations = new streamsdk::Option;
+ if(!num_iterations)
+ {
+ sampleCommon->error("Memory allocation error.\n");
+ return SDK_FAILURE;
+ }
+
+ num_iterations->_sVersion = "i";
+ num_iterations->_lVersion = "iterations";
+ num_iterations->_description = "Number of iterations for kernel execution";
+ num_iterations->_type = streamsdk::CA_ARG_INT;
+ num_iterations->_value = &iterations;
+
+ sampleArgs->AddOption(num_iterations);
+
+ delete num_iterations;
+
+ return SDK_SUCCESS;
+}
+
+int
+AESEncryptDecrypt::setup()
+{
+
+ std::string filePath = sampleCommon->getPath() + inFilename;
+ image.load(filePath.c_str());
+
+ width = image.getWidth();
+ height = image.getHeight();
+
+ /* check condition for the bitmap to be initialized */
+ if(width<0 || height <0)
+ return SDK_FAILURE;
+
+ pixels = image.getPixels();
+
+ if(setupAESEncryptDecrypt()!=SDK_SUCCESS)
+ return SDK_FAILURE;
+
+ int timer = sampleCommon->createTimer();
+ sampleCommon->resetTimer(timer);
+ sampleCommon->startTimer(timer);
+
+ if(setupCL()!=SDK_SUCCESS)
+ return SDK_FAILURE;
+
+ sampleCommon->stopTimer(timer);
+
+ setupTime = (double)(sampleCommon->readTimer(timer));
+
+ return SDK_SUCCESS;
+}
+
+
+int
+AESEncryptDecrypt::run()
+{
+ int timer = sampleCommon->createTimer();
+ sampleCommon->resetTimer(timer);
+ sampleCommon->startTimer(timer);
+
+ std::cout << "Executing kernel for " << iterations <<
+ " iterations" << std::endl;
+ std::cout << "-------------------------------------------" << std::endl;
+
+ for(int i = 0; i < iterations; i++)
+ {
+ /* Arguments are set and execution call is enqueued on command buffer */
+ if(runCLKernels()!=SDK_SUCCESS)
+ return SDK_FAILURE;
+ }
+
+ sampleCommon->stopTimer(timer);
+ totalKernelTime = (double)(sampleCommon->readTimer(timer)) / iterations;
+
+
+ //XXX: Write output to an output Image
+
+ convertGrayToPixels(output, pixels);
+ image.write(outFilename.c_str());
+
+ if(!quiet) {
+ std::cout << "Output Filename : " << outFilename << std::endl;
+ }
+
+
+ return SDK_SUCCESS;
+}
+
+int
+AESEncryptDecrypt::verifyResults()
+{
+ if(verify)
+ {
+ verificationOutput = (cl_uchar *) malloc(width*height*sizeof(cl_uchar));
+ if(verificationOutput==NULL) {
+ sampleCommon->error("Failed to allocate host memory. (verificationOutput)");
+ return SDK_FAILURE;
+ }
+
+ /*
+ * reference implementation
+ */
+ int refTimer = sampleCommon->createTimer();
+ sampleCommon->resetTimer(refTimer);
+ sampleCommon->startTimer(refTimer);
+ AESEncryptDecryptCPUReference(verificationOutput, input, roundKey, explandedKeySize,
+ width, height, decrypt);
+ sampleCommon->stopTimer(refTimer);
+ referenceKernelTime = sampleCommon->readTimer(refTimer);
+
+ /* compare the results and see if they match */
+ if(memcmp(output, verificationOutput, height*width*sizeof(cl_uchar)) == 0)
+ {
+ std::cout<<"Passed!\n";
+ return SDK_SUCCESS;
+ }
+ else
+ {
+ std::cout<<"Failed\n";
+ return SDK_FAILURE;
+ }
+ }
+
+ return SDK_SUCCESS;
+}
+
+void AESEncryptDecrypt::printStats()
+{
+ std::string strArray[4] = {"Width", "Height", "Time(sec)", "KernelTime(sec)"};
+ std::string stats[4];
+
+ totalTime = setupTime + totalKernelTime;
+
+ stats[0] = sampleCommon->toString(width , std::dec);
+ stats[1] = sampleCommon->toString(height , std::dec);
+ stats[2] = sampleCommon->toString(totalTime, std::dec);
+ stats[3] = sampleCommon->toString(totalKernelTime, std::dec);
+
+ this->SDKSample::printStats(strArray, stats, 4);
+}
+
+int AESEncryptDecrypt::cleanup()
+{
+ /* Releases OpenCL resources (Context, Memory etc.) */
+ cl_int status;
+
+ status = clReleaseKernel(kernel);
+ if(!sampleCommon->checkVal(
+ status,
+ CL_SUCCESS,
+ "clReleaseKernel failed."))
+ return SDK_FAILURE;
+
+ status = clReleaseProgram(program);
+ if(!sampleCommon->checkVal(
+ status,
+ CL_SUCCESS,
+ "clReleaseProgram failed."))
+ return SDK_FAILURE;
+
+ status = clReleaseMemObject(inputBuffer);
+ if(!sampleCommon->checkVal(
+ status,
+ CL_SUCCESS,
+ "clReleaseMemObject failed."))
+ return SDK_FAILURE;
+
+ status = clReleaseMemObject(outputBuffer);
+ if(!sampleCommon->checkVal(
+ status,
+ CL_SUCCESS,
+ "clReleaseMemObject failed."))
+ return SDK_FAILURE;
+
+ status = clReleaseMemObject(rKeyBuffer);
+ if(!sampleCommon->checkVal(
+ status,
+ CL_SUCCESS,
+ "clReleaseMemObject failed."))
+ return SDK_FAILURE;
+
+ status = clReleaseMemObject(sBoxBuffer);
+ if(!sampleCommon->checkVal(
+ status,
+ CL_SUCCESS,
+ "clReleaseMemObject failed."))
+ return SDK_FAILURE;
+
+ status = clReleaseMemObject(rsBoxBuffer);
+ if(!sampleCommon->checkVal(
+ status,
+ CL_SUCCESS,
+ "clReleaseMemObject failed."))
+ return SDK_FAILURE;
+
+ status = clReleaseCommandQueue(commandQueue);
+ if(!sampleCommon->checkVal(
+ status,
+ CL_SUCCESS,
+ "clReleaseCommandQueue failed."))
+ return SDK_FAILURE;
+
+ status = clReleaseContext(context);
+ if(!sampleCommon->checkVal(
+ status,
+ CL_SUCCESS,
+ "clReleaseContext failed."))
+ return SDK_FAILURE;
+
+ /* release program resources (input memory etc.) */
+ if(input)
+ free(input);
+
+ if(key)
+ free(key);
+
+ if(expandedKey)
+ free(expandedKey);
+
+ if(roundKey)
+ free(roundKey);
+
+ if(output)
+ free(output);
+
+ if(verificationOutput)
+ free(verificationOutput);
+
+ if(devices)
+ free(devices);
+
+ if(maxWorkItemSizes)
+ free(maxWorkItemSizes);
+
+ return SDK_SUCCESS;
+}
+
+int
+main(int argc, char * argv[])
+{
+ AESEncryptDecrypt clAESEncryptDecrypt("OpenCL AES Encrypt Decrypt");
+
+ if(clAESEncryptDecrypt.initialize()!=SDK_SUCCESS)
+ return SDK_FAILURE;
+ if(!clAESEncryptDecrypt.parseCommandLine(argc, argv))
+ return SDK_FAILURE;
+
+ if(clAESEncryptDecrypt.isDumpBinaryEnabled())
+ {
+ return clAESEncryptDecrypt.genBinaryImage();
+ }
+ else
+ {
+ if(clAESEncryptDecrypt.setup()!=SDK_SUCCESS)
+ return SDK_FAILURE;
+ if(clAESEncryptDecrypt.run()!=SDK_SUCCESS)
+ return SDK_FAILURE;
+ if(clAESEncryptDecrypt.verifyResults()!=SDK_SUCCESS)
+ return SDK_FAILURE;
+ if(clAESEncryptDecrypt.cleanup()!=SDK_SUCCESS)
+ return SDK_FAILURE;
+ clAESEncryptDecrypt.printStats();
+ }
+
+ return SDK_SUCCESS;
+}
+
+#endif
+
+#include "common.h"
+void verify();
+
+ cl_uchar sbox[256] =
+ { 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76 //0
+ , 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0 //1
+ , 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15 //2
+ , 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75 //3
+ , 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84 //4
+ , 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf //5
+ , 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8 //6
+ , 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5, 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2 //7
+ , 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17, 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73 //8
+ , 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb //9
+ , 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79 //A
+ , 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9, 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08 //B
+ , 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a //C
+ , 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e //D
+ , 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf //E
+ , 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16};//F
+ //0 1 2 3 4 5 6 7 8 9 A B C D E F
+
+
+ cl_uchar rsbox[256] =
+ { 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38, 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
+ , 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87, 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
+ , 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d, 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
+ , 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2, 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
+ , 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16, 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
+ , 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda, 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
+ , 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a, 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
+ , 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02, 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
+ , 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea, 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
+ , 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85, 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
+ , 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89, 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
+ , 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20, 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
+ , 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31, 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
+ , 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d, 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
+ , 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0, 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
+ , 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26, 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d};
+
+
+ cl_uchar Rcon[255] =
+ { 0x8d, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36, 0x6c, 0xd8, 0xab, 0x4d, 0x9a
+ , 0x2f, 0x5e, 0xbc, 0x63, 0xc6, 0x97, 0x35, 0x6a, 0xd4, 0xb3, 0x7d, 0xfa, 0xef, 0xc5, 0x91, 0x39
+ , 0x72, 0xe4, 0xd3, 0xbd, 0x61, 0xc2, 0x9f, 0x25, 0x4a, 0x94, 0x33, 0x66, 0xcc, 0x83, 0x1d, 0x3a
+ , 0x74, 0xe8, 0xcb, 0x8d, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36, 0x6c, 0xd8
+ , 0xab, 0x4d, 0x9a, 0x2f, 0x5e, 0xbc, 0x63, 0xc6, 0x97, 0x35, 0x6a, 0xd4, 0xb3, 0x7d, 0xfa, 0xef
+ , 0xc5, 0x91, 0x39, 0x72, 0xe4, 0xd3, 0xbd, 0x61, 0xc2, 0x9f, 0x25, 0x4a, 0x94, 0x33, 0x66, 0xcc
+ , 0x83, 0x1d, 0x3a, 0x74, 0xe8, 0xcb, 0x8d, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b
+ , 0x36, 0x6c, 0xd8, 0xab, 0x4d, 0x9a, 0x2f, 0x5e, 0xbc, 0x63, 0xc6, 0x97, 0x35, 0x6a, 0xd4, 0xb3
+ , 0x7d, 0xfa, 0xef, 0xc5, 0x91, 0x39, 0x72, 0xe4, 0xd3, 0xbd, 0x61, 0xc2, 0x9f, 0x25, 0x4a, 0x94
+ , 0x33, 0x66, 0xcc, 0x83, 0x1d, 0x3a, 0x74, 0xe8, 0xcb, 0x8d, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20
+ , 0x40, 0x80, 0x1b, 0x36, 0x6c, 0xd8, 0xab, 0x4d, 0x9a, 0x2f, 0x5e, 0xbc, 0x63, 0xc6, 0x97, 0x35
+ , 0x6a, 0xd4, 0xb3, 0x7d, 0xfa, 0xef, 0xc5, 0x91, 0x39, 0x72, 0xe4, 0xd3, 0xbd, 0x61, 0xc2, 0x9f
+ , 0x25, 0x4a, 0x94, 0x33, 0x66, 0xcc, 0x83, 0x1d, 0x3a, 0x74, 0xe8, 0xcb, 0x8d, 0x01, 0x02, 0x04
+ , 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36, 0x6c, 0xd8, 0xab, 0x4d, 0x9a, 0x2f, 0x5e, 0xbc, 0x63
+ , 0xc6, 0x97, 0x35, 0x6a, 0xd4, 0xb3, 0x7d, 0xfa, 0xef, 0xc5, 0x91, 0x39, 0x72, 0xe4, 0xd3, 0xbd
+ , 0x61, 0xc2, 0x9f, 0x25, 0x4a, 0x94, 0x33, 0x66, 0xcc, 0x83, 0x1d, 0x3a, 0x74, 0xe8, 0xcb };
+
+char* filename = "input512.bmp";
+int width, height;
+cl_uchar *input;
+cl_uchar *output;
+
+
+int main(int argc, char**argv)
+{
+ struct args args = {0};
+ int err, i;
+
+ parseArgs(argc, argv, &args);
+
+ cl_device_id device = getDeviceID(args.d);
+ cl_context context = clCreateContext(0, 1, &device, NULL, NULL, &err); CHK_ERR(err);
+ cl_command_queue queue = clCreateCommandQueue(context, device, 0, &err); CHK_ERR(err);
+ cl_kernel e_kernel = getKernel(device, context, "aes_kernels.cl", "AESEncrypt");
+ cl_kernel d_kernel = getKernel(device, context, "aes_kernels.cl", "AESDecrypt");
+
+ cl_uchar4 *pixels = (cl_uchar4 *) readBmp(filename, &width, &height);
+
+ cl_uint sizeBytes = width*height*sizeof(cl_uchar);
+ input = newBuffer(sizeBytes, 0);
+
+ /* initialize the input array, do NOTHING but assignment when decrypt*/
+ int decrypt = 0;
+#if 0
+ if(!decrypt)
+ convertColorToGray(pixels, input);
+ else
+ convertGrayToGray(pixels, input);
+#endif
+ output = (cl_uchar*)malloc(sizeBytes);
+
+
+
+ cl_mem inputBuffer = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
+ sizeof(cl_uchar ) * width * height, input, &err); CHK_ERR(err);
+
+ cl_mem outputBuffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR,
+ sizeof(cl_uchar ) * width * height, output, &err); CHK_ERR(err);
+
+#if 0
+ cl_mem rKeyBuffer = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
+ sizeof(cl_uchar ) * explandedKeySize, roundKey, &err); CHK_ERR(err);
+
+ cl_uchar *sBox = (cl_uchar *)sbox;
+ cl_mem sBoxBuffer = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
+ sizeof(cl_uchar ) * 256, sBox, &err); CHK_ERR(err);
+
+ cl_uchar *rsBox = (cl_uchar *)rsbox;
+ clmem rsBoxBuffer = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
+ sizeof(cl_uchar ) * 256, rsBox, &err); CHK_ERR(err);
+#endif
+
+#if 0
+#if 0
+ for (i = 0; i < MAX; i++) {
+ a[i] = b[i] = (float) i;
+ c[i] = 0.0f;
+ }
+#else
+ a = newBuffer(MAX * sizeof(float), 'f');
+ b = newBuffer(MAX * sizeof(float), 'f');
+ c = newBuffer(MAX * sizeof(float), '0');
+ d = newBuffer(MAX * sizeof(float), '0');
+#endif
+
+ cl_mem da = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR, MAX * sizeof(float), a, &err); CHK_ERR(err);
+ cl_mem db = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR, MAX * sizeof(float), b, &err); CHK_ERR(err);
+ cl_mem dc = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR, MAX * sizeof(float), c, &err); CHK_ERR(err);
+
+
+ /* Execute */
+ int gws = MAX;
+ int lws = 16;
+
+ err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &da); CHK_ERR(err);
+ err = clSetKernelArg(kernel, 1, sizeof(cl_mem), &db); CHK_ERR(err);
+ err = clSetKernelArg(kernel, 2, sizeof(cl_mem), &dc); CHK_ERR(err);
+ err = clSetKernelArg(kernel, 3, sizeof(size_t), &gws); CHK_ERR(err);
+
+ err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &gws, &lws, 0, NULL, NULL); CHK_ERR(err);
+ err = clEnqueueReadBuffer(queue, dc, CL_TRUE, 0, MAX*sizeof(float), d, 0, NULL, NULL); CHK_ERR(err);
+
+ verify();
+}
+
+void verify()
+{
+ int i;
+ for (i = 0; i < MAX; i++) {
+ float err = d[i] - (a[i] + b[i]);
+
+ err = fabsf(err);
+#define EPS 1.0e-7f // 1.0e-8 fails on cpu
+ if (err >= EPS) {
+ printf("Mismatch: %8d: %8f %8f %8f (err=%g 0x%.08x)\n", i, d[i], a[i], b[i], err, *(unsigned int *) &err);
+ printf("Failed\n");
+ exit(-1);
+ }
+ }
+ printf("Passed\n");
+#endif
+}
--- /dev/null
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia@intel.com>
+ */
+
+#include "common.h"
+void verify();
+
+float *randArray, *output;
+float *refOutput, *stepsArray;
+int numSamples = 64;
+int numSteps = 255;
+
+
+int main(int argc, char**argv)
+{
+ struct args args = {0};
+ int err;
+ char *ker_path = NULL;
+
+ parseArgs(argc, argv, &args);
+
+ cl_device_id device = getDeviceID(args.d);
+#if TEST_SIMD8
+ ker_path = do_kiss_path("binomialOption_kernels_0.bin", device);
+#else
+ ker_path = do_kiss_path("binomialOption_kernels_0.bin8", device);
+#endif
+ cl_context context = clCreateContext(0, 1, &device, NULL, NULL, &err); CHK_ERR(err);
+ cl_command_queue queue = clCreateCommandQueue(context, device, 0, &err); CHK_ERR(err);
+ cl_kernel kernel = getKernelFromBinary(device, context, ker_path, "binomial_options");
+
+ randArray = newBuffer(numSamples * sizeof(cl_float4), 'f');
+ output = newBuffer(numSamples * sizeof(cl_float4), 0 );
+ cl_mem randBuffer = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
+ numSamples * sizeof(cl_float4), randArray, &err);
+ CHK_ERR(err);
+ cl_mem outBuffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_COPY_HOST_PTR,
+ numSamples * sizeof(cl_float4), output, &err);
+ CHK_ERR(err);
+
+ /* Execute */
+ size_t gws[1] = {numSamples * (numSteps+1)};
+ size_t lws[1] = {numSteps + 1};
+
+ err = clSetKernelArg(kernel, 0, sizeof(int), &numSteps); CHK_ERR(err);
+ err = clSetKernelArg(kernel, 1, sizeof(cl_mem), &randBuffer); CHK_ERR(err);
+ err = clSetKernelArg(kernel, 2, sizeof(cl_mem), &outBuffer); CHK_ERR(err);
+ err = clSetKernelArg(kernel, 3, (numSteps + 1) * sizeof(cl_float4), NULL); CHK_ERR(err);
+ err = clSetKernelArg(kernel, 4, numSteps * sizeof(cl_float4), NULL); CHK_ERR(err);
+ err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, gws, lws, 0, NULL, NULL); CHK_ERR(err);
+#if 0
+ err = clEnqueueReadBuffer(queue, outBuffer, CL_TRUE, 0,
+ numSamples * sizeof(cl_float4), output,
+ 0, NULL, NULL);
+#else
+ output = clIntelMapBuffer(outBuffer, &err);
+ CHK_ERR(err);
+#endif
+
+ verify();
+ return 0;
+}
+
+#define RISKFREE 0.02f
+#define VOLATILITY 0.30f
+
+/*
+ * Reduces the input array (in place)
+ * length specifies the length of the array
+ */
+ void
+binomialOptionCPU()
+{
+ refOutput = newBuffer(numSamples * sizeof(cl_float4), 0);
+ stepsArray = newBuffer((numSteps + 1) * sizeof(cl_float4), 0);
+ int bid;
+
+ /* Iterate for all samples */
+ for(bid = 0; bid < numSamples; ++bid)
+ {
+ float s[4];
+ float x[4];
+ float vsdt[4];
+ float puByr[4];
+ float pdByr[4];
+ float optionYears[4];
+
+ float inRand[4];
+ int i, j, k;
+
+
+ for(i = 0; i < 4; ++i)
+ {
+ inRand[i] = randArray[bid + i];
+ s[i] = (1.0f - inRand[i]) * 5.0f + inRand[i] * 30.f;
+ x[i] = (1.0f - inRand[i]) * 1.0f + inRand[i] * 100.f;
+ optionYears[i] = (1.0f - inRand[i]) * 0.25f + inRand[i] * 10.f;
+ float dt = optionYears[i] * (1.0f / (float)numSteps);
+ vsdt[i] = VOLATILITY * sqrtf(dt);
+ float rdt = RISKFREE * dt;
+ float r = expf(rdt);
+ float rInv = 1.0f / r;
+ float u = expf(vsdt[i]);
+ float d = 1.0f / u;
+ float pu = (r - d)/(u - d);
+ float pd = 1.0f - pu;
+ puByr[i] = pu * rInv;
+ pdByr[i] = pd * rInv;
+ }
+ // Compute values at expiration date:
+ // Call option value at period end is v(t) = s(t) - x
+ // If s(t) is greater than x, or zero otherwise...
+ // The computation is similar for put options...
+ for(j = 0; j <= numSteps; j++)
+ {
+ for(i = 0; i < 4; ++i)
+ {
+ float profit = s[i] * expf(vsdt[i] * (2.0f * j - numSteps)) - x[i];
+ stepsArray[j * 4 + i] = profit > 0.0f ? profit : 0.0f;
+ }
+ }
+
+ //walk backwards up on the binomial tree of depth numSteps
+ //Reduce the price step by step
+ for(j = numSteps; j > 0; --j)
+ {
+ for(k = 0; k <= j - 1; ++k)
+ {
+ for(i = 0; i < 4; ++i)
+ {
+ stepsArray[k * 4 + i] = pdByr[i] * stepsArray[(k + 1) * 4 + i] + puByr[i] * stepsArray[k * 4 + i];
+ }
+ }
+ }
+
+ //Copy the root to result
+ refOutput[bid] = stepsArray[0];
+ }
+}
+
+void verify()
+{
+ binomialOptionCPU();
+ int i;
+ for (i=0; i<20; i++) {
+ printf("%13.8f %13.8f\n", output[i], refOutput[i]);
+ }
+ int resC = comparef(output, refOutput, numSamples * 4, 0.001f);
+ resC ? printf("Passed\n") : printf("Failed\n");
+}
+
--- /dev/null
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia@intel.com>
+ */
+
+#include "common.h"
+void verify();
+
+int length = 1024;
+int *input, *refInput;
+
+int main(int argc, char**argv)
+{
+ struct args args = {0};
+ int err, i;
+
+ parseArgs(argc, argv, &args);
+
+ cl_device_id device = getDeviceID(args.d);
+ cl_context context = clCreateContext(0, 1, &device, NULL, NULL, &err); CHK_ERR(err);
+ cl_command_queue queue = clCreateCommandQueue(context, device, 0, &err); CHK_ERR(err);
+ cl_kernel kernel = getKernel(device, context, "bitonic_kernels.cl", "bitonicSort");
+
+ int length_bytes = length * sizeof(cl_uint);
+
+ input = newBuffer(length_bytes, 0);
+ refInput = newBuffer(length_bytes, 0);
+ for (i = 0; i < length; i++) {
+ input[i] = refInput[i] = rand() & 0x0fffff;
+ }
+ cl_mem inputBuffer = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
+ length_bytes, input, &err);
+ CHK_ERR(err);
+
+ /*
+ * Execute
+ *
+ * This algorithm is run as NS stages. Each stage has NP passes.
+ * so the total number of times the kernel call is enqueued is NS * NP.
+ *
+ * For every stage S, we have S + 1 passes.
+ * eg: For stage S = 0, we have 1 pass.
+ * For stage S = 1, we have 2 passes.
+ *
+ * if length is 2^N, then the number of stages (numStages) is N.
+ * Do keep in mind the fact that the algorithm only works for
+ * arrays whose size is a power of 2.
+ *
+ * here, numStages is N.
+ *
+ * For an explanation of how the algorithm works, please go through
+ * the documentation of this sample.
+ */
+
+ /*
+ * 2^numStages should be equal to length.
+ * i.e the number of times you halve length to get 1 should be numStages
+ */
+ int numStages = 0;
+ int sortDescending = 1;
+ for(i = length; i > 1; i >>= 1)
+ ++numStages;
+
+ /*** Set appropriate arguments to the kernel ***/
+ err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputBuffer); CHK_ERR(err);
+ err = clSetKernelArg(kernel, 3, sizeof(cl_uint), &length); CHK_ERR(err);
+ err = clSetKernelArg(kernel, 4, sizeof(cl_uint), &sortDescending); CHK_ERR(err);
+
+ int stage, passOfStage;
+ size_t globalThreads[1] = {length / 2};
+ size_t localThreads[1] = {1};
+
+ for(stage = 0; stage < numStages; stage++) {
+ /* stage of the algorithm */
+ err = clSetKernelArg(kernel, 1, sizeof(cl_uint), &stage);
+ CHK_ERR(err);
+
+ /* Every stage has stage+1 passes. */
+ for(passOfStage = 0; passOfStage < stage + 1; passOfStage++) {
+ /* pass of the current stage */
+ err = clSetKernelArg( kernel, 2, sizeof(cl_uint), &passOfStage);
+ CHK_ERR(err);
+
+ /*
+ * Enqueue a kernel run call.
+ * For simplicity, the groupsize used is 1.
+ *
+ * Each thread writes a sorted pair.
+ * So, the number of threads (global) is half the length.
+ */
+ err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL,
+ globalThreads, localThreads,
+ 0, NULL, NULL);
+ CHK_ERR(err);
+
+ /* wait for the kernel call to finish execution */
+ err = clFinish(queue);
+ CHK_ERR(err);
+ }
+ }
+ err = clEnqueueReadBuffer(queue, inputBuffer, CL_TRUE, 0,
+ length_bytes, input,
+ 0, NULL, NULL);
+ CHK_ERR(err);
+
+ verify();
+
+}
+
+int compare(const void *x, const void *y)
+{
+ unsigned int a = *(unsigned int *)x;
+ unsigned int b = *(unsigned int *)y;
+ return (a-b);
+}
+
+
+void verify()
+{
+ int i;
+
+ qsort(refInput, length, sizeof(int), compare);
+ for (i = 0; i < length; i++) {
+ if (i < 20) {
+ printf("%8d %8d\n", input[i], refInput[i]);
+ }
+ if (input[i] != refInput[i]) {
+ printf("Failed at %d: %d vs. %d\n", i,
+ input[i], refInput[i]);
+ printf("Failed");
+ exit(-1);
+ }
+ }
+ printf("Passed\n");
+}
+
--- /dev/null
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia@intel.com>
+ */
+
+#include "common.h"
+#include "cl_test.h"
+
+void verify(int compareCall, int comparePut);
+
+float *randArray, *devCallPrice, *devPutPrice, *hostCallPrice, *hostPutPrice;
+int width = 32;
+int height = 16;
+
+int main(int argc, char **argv)
+{
+ struct args args = { 0 };
+ char *ker_path = NULL;
+ int err;
+
+ parseArgs(argc, argv, &args);
+
+ cl_device_id device = getDeviceID(args.d);
+#if TEST_SIMD8
+ ker_path = do_kiss_path("blackscholes_kernel_0.bin8", device);
+#else
+ ker_path = do_kiss_path("blackscholes_kernel_0.bin", device);
+#endif
+ cl_context context = clCreateContext(0, 1, &device, NULL, NULL, &err);
+ CHK_ERR(err);
+ cl_command_queue queue = clCreateCommandQueue(context, device, 0, &err);
+ CHK_ERR(err);
+ cl_kernel kernel = getKernelFromBinary(device, context, ker_path, "blackScholes");
+
+ int whf4 = width * height * sizeof(cl_float4);
+
+ randArray = newBuffer(whf4, 'f');
+ devCallPrice = newBuffer(whf4, 0);
+ //devPutPrice = newBuffer(whf4, 0);
+ hostCallPrice = newBuffer(whf4, 0);
+ hostPutPrice = newBuffer(whf4, 0);
+
+ cl_mem randBuf = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, whf4, randArray, &err);
+ CHK_ERR(err);
+ cl_mem callPriceBuf = clCreateBuffer(context, CL_MEM_WRITE_ONLY, whf4, NULL, &err);
+ CHK_ERR(err);
+ cl_mem putPriceBuf = clCreateBuffer(context, CL_MEM_WRITE_ONLY, whf4, NULL, &err);
+ CHK_ERR(err);
+
+ size_t globalThreads[2] = { width, height };
+ //size_t localThreads[2] = { 4, 4 };
+ size_t localThreads[2] = { 16, 1 };
+
+ /* whether sort is to be in increasing order. CL_TRUE implies increasing */
+ err = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &randBuf);
+ CHK_ERR(err);
+ err = clSetKernelArg(kernel, 1, sizeof(width), (const void *) &width);
+ CHK_ERR(err);
+ err = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *) &callPriceBuf);
+ CHK_ERR(err);
+ err = clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *) &putPriceBuf);
+ CHK_ERR(err);
+
+ err = clEnqueueNDRangeKernel(queue, kernel, 2, NULL, globalThreads, localThreads, 0, NULL, NULL);
+ CHK_ERR(err);
+
+ devPutPrice = clIntelMapBuffer(putPriceBuf, &err);
+ CHK_ERR(err);
+ devCallPrice = clIntelMapBuffer(callPriceBuf, &err);
+ CHK_ERR(err);
+ verify(1, 1);
+#if 0
+ int i;
+ for (i = 0; i < width*height*4; i++) {
+ printf("%12.8f, %12.8f | %12.8f %12.8f\n",
+ devCallPrice[i],
+ hostCallPrice[i],
+ devPutPrice[i],
+ hostPutPrice[i]);
+ fflush(stdout);
+ }
+#endif
+ clIntelUnmapBuffer(putPriceBuf);
+ return err;
+}
+
+#define S_LOWER_LIMIT 10.0f
+#define S_UPPER_LIMIT 100.0f
+#define K_LOWER_LIMIT 10.0f
+#define K_UPPER_LIMIT 100.0f
+#define T_LOWER_LIMIT 1.0f
+#define T_UPPER_LIMIT 10.0f
+#define R_LOWER_LIMIT 0.01f
+#define R_UPPER_LIMIT 0.05f
+#define SIGMA_LOWER_LIMIT 0.01f
+#define SIGMA_UPPER_LIMIT 0.10f
+
+float phi(float X)
+{
+ float y, absX, t;
+
+ // the coeffs
+ const float c1 = 0.319381530f;
+ const float c2 = -0.356563782f;
+ const float c3 = 1.781477937f;
+ const float c4 = -1.821255978f;
+ const float c5 = 1.330274429f;
+
+ const float oneBySqrt2pi = 0.398942280f;
+
+ absX = fabsf(X);
+ t = 1.0f / (1.0f + 0.2316419f * absX);
+
+ y = 1.0f - oneBySqrt2pi * exp(-X * X / 2.0f) *
+ t * (c1 + t * (c2 + t * (c3 + t * (c4 + t * c5))));
+
+ return (X < 0) ? (1.0f - y) : y;
+}
+
+void blackScholesCPU(int width, int height, float *randArray,
+ float *hostCallPrice, float *hostPutPrice)
+{
+ int y;
+ for (y = 0; y < width * height * 4; ++y) {
+ float d1, d2;
+ float sigmaSqrtT;
+ float KexpMinusRT;
+ float s = S_LOWER_LIMIT * randArray[y] + S_UPPER_LIMIT * (1.0f - randArray [y]);
+ float k = K_LOWER_LIMIT * randArray[y] + K_UPPER_LIMIT * (1.0f - randArray [y]);
+ float t = T_LOWER_LIMIT * randArray[y] + T_UPPER_LIMIT * (1.0f - randArray [y]);
+ float r = R_LOWER_LIMIT * randArray[y] + R_UPPER_LIMIT * (1.0f - randArray [y]);
+ float sigma =
+ SIGMA_LOWER_LIMIT * randArray[y] +
+ SIGMA_UPPER_LIMIT * (1.0f - randArray[y]);
+
+ sigmaSqrtT = sigma * sqrt(t);
+
+ d1 = (log(s / k) +
+ (r + sigma * sigma / 2.0f) * t) / sigmaSqrtT;
+ d2 = d1 - sigmaSqrtT;
+
+ KexpMinusRT = k * exp(-r * t);
+ hostCallPrice[y] = s * phi(d1) - KexpMinusRT * phi(d2);
+ hostPutPrice[y] = KexpMinusRT * phi(-d2) - s * phi(-d1);
+ }
+}
+
+void verify(int compareCall, int comparePut)
+{
+ blackScholesCPU(width, height, randArray, hostCallPrice, hostPutPrice);
+ int resC = 1, resP = 1;
+ if (compareCall)
+ resC = comparef(hostCallPrice, devCallPrice, width * height * 4, 1.0e-5f);
+ if (comparePut)
+ resP = comparef(hostPutPrice, devPutPrice, width * height * 4, 1.0e-5f);
+ printf("resC=%d, resP=%d: ", resC, resP);
+ if (resC && resP)
+ printf("Passed!\n");
+ else
+ printf("Failed!\n");
+}
+
--- /dev/null
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia@intel.com>
+ */
+
+#include "cl_file_map.h"
+
+#include "CL/cl.h"
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <assert.h>
+#include <string.h>
+#include <stdio.h>
+
+int
+cl_file_map_init(cl_file_map_t *fm)
+{
+ assert(fm);
+ memset(fm,0,sizeof(*fm));
+ return CL_SUCCESS;
+}
+
+void
+cl_file_map_destroy(cl_file_map_t *fm)
+{
+ if (fm->mapped) {
+ munmap(fm->start, fm->size);
+ fm->start = fm->stop = 0;
+ fm->size = 0;
+ fm->mapped = CL_FALSE;
+ }
+ if(fm->fd) {
+ close(fm->fd);
+ fm->fd = 0;
+ }
+ free(fm->name);
+ memset(fm,0,sizeof(*fm));
+}
+
+void
+cl_file_map_delete(cl_file_map_t *fm)
+{
+ if (fm == NULL)
+ return;
+ cl_file_map_destroy(fm);
+ free(fm);
+}
+
+cl_file_map_t*
+cl_file_map_new(void)
+{
+ cl_file_map_t *fm = NULL;
+
+ if ((fm = calloc(1, sizeof(cl_file_map_t))) == NULL)
+ goto error;
+ if (cl_file_map_init(fm) != CL_SUCCESS)
+ goto error;
+
+exit:
+ return fm;
+error:
+ cl_file_map_delete(fm);
+ fm = NULL;
+ goto exit;
+}
+
+int
+cl_file_map_open(cl_file_map_t *fm, const char *name)
+{
+ int err = CL_FILE_MAP_SUCCESS;
+
+ /* Open the file */
+ fm->fd = open(name, O_RDONLY);
+ if(fm->fd <= 0) {
+ err = CL_FILE_MAP_FILE_NOT_FOUND;
+ goto error;
+ }
+ if ((fm->name = calloc(strlen(name) + 1, sizeof(char))) == NULL)
+ goto error;
+ sprintf(fm->name, "%s", name);
+
+ /* Map it */
+ fm->size = lseek(fm->fd, 0, SEEK_END);
+ lseek(fm->fd, 0, SEEK_SET);
+ fm->start = mmap(0, fm->size, PROT_READ, MAP_SHARED, fm->fd, 0);
+ if(fm->start <= 0) {
+ err = CL_FILE_MAP_FAILED_TO_MMAP;
+ goto error;
+ }
+
+ fm->stop = ((char *) fm->start) + fm->size;
+ fm->mapped = CL_TRUE;
+
+exit:
+ return err;
+error:
+ cl_file_map_destroy(fm);
+ goto exit;
+}
+
--- /dev/null
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia@intel.com>
+ */
+
+#ifndef __CL_FILE_MAP_H__
+#define __CL_FILE_MAP_H__
+
+#include "CL/cl.h"
+#include <stdlib.h>
+
+/* Map a file into memory for direct / cached / simple accesses */
+typedef struct cl_file_map {
+ void *start, *stop; /* First character and last one */
+ size_t size; /* Total size of the file */
+ int fd; /* Posix file descriptor */
+ cl_bool mapped; /* Indicate if a file was mapped or not */
+ char *name; /* File itself */
+} cl_file_map_t;
+
+/* Report information about an open temptative */
+enum {
+ CL_FILE_MAP_SUCCESS = 0,
+ CL_FILE_MAP_FILE_NOT_FOUND = 1,
+ CL_FILE_MAP_FAILED_TO_MMAP = 2
+};
+
+/* Allocate and Initialize a file mapper (but do not map any file */
+extern cl_file_map_t *cl_file_map_new(void);
+
+/* Initialize a file mapper (but do not map any file */
+extern int cl_file_map_init(cl_file_map_t *fm);
+
+/* Destroy but do not deallocate a file map */
+extern void cl_file_map_destroy(cl_file_map_t *fm);
+
+/* Destroy and free it */
+extern void cl_file_map_delete(cl_file_map_t *fm);
+
+/* Open a file and returns the error code */
+extern int cl_file_map_open(cl_file_map_t *fm, const char *name);
+
+static inline cl_bool
+cl_file_map_is_mapped(const cl_file_map_t *fm) {
+ return fm->mapped;
+}
+
+static inline const char*
+cl_file_map_begin(const cl_file_map_t *fm) {
+ return (const char*) fm->start;
+}
+
+static inline const char*
+cl_file_map_end(const cl_file_map_t *fm) {
+ return (const char*) fm->stop;
+}
+
+static inline size_t
+cl_file_map_size(const cl_file_map_t *fm) {
+ return fm->size;
+}
+
+#endif /* __CL_FILE_MAP_H__ */
+
--- /dev/null
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia@intel.com>
+ */
+
+#include "cl_file_map.h"
+#include "cl_test.h"
+#include "common.h"
+#include "CL/cl.h"
+#include "CL/cl_intel.h"
+
+#include <stdio.h>
+#include <stdint.h>
+#include <string.h>
+#include <assert.h>
+
+#define FATAL(...) \
+do { \
+ fprintf(stderr, "error: "); \
+ fprintf(stderr, __VA_ARGS__); \
+ assert(0); \
+ exit(-1); \
+} while (0)
+
+#define FATAL_IF(COND, ...) \
+do { \
+ if (COND) FATAL(__VA_ARGS__); \
+} while (0)
+
+cl_platform_id platform;
+cl_device_id device;
+cl_context ctx;
+cl_program program;
+cl_kernel kernel;
+cl_command_queue queue;
+
+static const char*
+cl_test_channel_order_string(cl_channel_order order)
+{
+ switch(order) {
+#define DECL_ORDER(WHICH) case CL_##WHICH: return "CL_"#WHICH
+ DECL_ORDER(R);
+ DECL_ORDER(A);
+ DECL_ORDER(RG);
+ DECL_ORDER(RA);
+ DECL_ORDER(RGB);
+ DECL_ORDER(RGBA);
+ DECL_ORDER(BGRA);
+ DECL_ORDER(ARGB);
+ DECL_ORDER(INTENSITY);
+ DECL_ORDER(LUMINANCE);
+ DECL_ORDER(Rx);
+ DECL_ORDER(RGx);
+ DECL_ORDER(RGBx);
+#undef DECL_ORDER
+ default: return "Unsupported image channel order";
+ };
+}
+
+static const char*
+cl_test_channel_type_string(cl_channel_type type)
+{
+ switch(type) {
+#define DECL_TYPE(WHICH) case CL_##WHICH: return "CL_"#WHICH
+ DECL_TYPE(SNORM_INT8);
+ DECL_TYPE(SNORM_INT16);
+ DECL_TYPE(UNORM_INT8);
+ DECL_TYPE(UNORM_INT16);
+ DECL_TYPE(UNORM_SHORT_565);
+ DECL_TYPE(UNORM_SHORT_555);
+ DECL_TYPE(UNORM_INT_101010);
+ DECL_TYPE(SIGNED_INT8);
+ DECL_TYPE(SIGNED_INT16);
+ DECL_TYPE(SIGNED_INT32);
+ DECL_TYPE(UNSIGNED_INT8);
+ DECL_TYPE(UNSIGNED_INT16);
+ DECL_TYPE(UNSIGNED_INT32);
+ DECL_TYPE(HALF_FLOAT);
+ DECL_TYPE(FLOAT);
+#undef DECL_TYPE
+ default: return "Unsupported image channel type";
+ };
+}
+
+int
+cl_test_init(const char *file_name, const char *kernel_name, int format)
+{
+ cl_file_map_t *fm = NULL;
+ cl_int status = CL_SUCCESS;
+ char *ker_path = NULL;
+ char name[128];
+ cl_uint platform_n;
+ size_t i;
+
+ /* Get the platform number */
+ CALL (clGetPlatformIDs, 0, NULL, &platform_n);
+ printf("platform number %u\n", platform_n);
+ assert(platform_n >= 1);
+
+ /* Get a valid platform */
+ CALL (clGetPlatformIDs, 1, &platform, &platform_n);
+ CALL (clGetPlatformInfo, platform, CL_PLATFORM_PROFILE, sizeof(name), name, NULL);
+ printf("platform_profile \"%s\"\n", name);
+ CALL (clGetPlatformInfo, platform, CL_PLATFORM_NAME, sizeof(name), name, NULL);
+ printf("platform_name \"%s\"\n", name);
+ CALL (clGetPlatformInfo, platform, CL_PLATFORM_VENDOR, sizeof(name), name, NULL);
+ printf("platform_vendor \"%s\"\n", name);
+ CALL (clGetPlatformInfo, platform, CL_PLATFORM_VERSION, sizeof(name), name, NULL);
+ printf("platform_version \"%s\"\n", name);
+
+ /* Get the device (only GPU device is supported right now) */
+ CALL (clGetDeviceIDs, platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);
+ CALL (clGetDeviceInfo, device, CL_DEVICE_PROFILE, sizeof(name), name, NULL);
+ printf("device_profile \"%s\"\n", name);
+ CALL (clGetDeviceInfo, device, CL_DEVICE_NAME, sizeof(name), name, NULL);
+ printf("device_name \"%s\"\n", name);
+ CALL (clGetDeviceInfo, device, CL_DEVICE_VENDOR, sizeof(name), name, NULL);
+ printf("device_vendor \"%s\"\n", name);
+ CALL (clGetDeviceInfo, device, CL_DEVICE_VERSION, sizeof(name), name, NULL);
+ printf("device_version \"%s\"\n", name);
+
+ /* Now create a context */
+ ctx = clCreateContext(0, 1, &device, NULL, NULL, &status);
+ if (status != CL_SUCCESS) {
+ fprintf(stderr, "error calling clCreateContext\n");
+ goto error;
+ }
+
+ /* All image types currently supported by the context */
+ cl_image_format fmt[256];
+ cl_uint fmt_n;
+ clGetSupportedImageFormats(ctx, 0, CL_MEM_OBJECT_IMAGE2D, 256, fmt, &fmt_n);
+ printf("%u image formats are supported\n", fmt_n);
+ for (i = 0; i < fmt_n; ++i)
+ printf("[%s %s]\n",
+ cl_test_channel_order_string(fmt[i].image_channel_order),
+ cl_test_channel_type_string(fmt[i].image_channel_data_type));
+
+ /* Load the program and build it */
+ ker_path = do_kiss_path(file_name, device);
+ if (format == LLVM)
+ program = clCreateProgramWithLLVM(ctx, 1, &device, ker_path, &status);
+ else if (format == SOURCE) {
+ cl_file_map_t *fm = cl_file_map_new();
+ FATAL_IF (cl_file_map_open(fm, ker_path) != CL_FILE_MAP_SUCCESS,
+ "Failed to open file");
+ const char *src = cl_file_map_begin(fm);
+ const size_t sz = cl_file_map_size(fm);
+ program = clCreateProgramWithSource(ctx, 1, &src, &sz, &status);
+ } else
+ FATAL("Not able to create program from binary");
+
+ if (status != CL_SUCCESS) {
+ fprintf(stderr, "error calling clCreateProgramWithBinary\n");
+ goto error;
+ }
+
+ /* OCL requires to build the program even if it is created from a binary */
+ CALL (clBuildProgram, program, 1, &device, NULL, NULL, NULL);
+
+ /* Create a kernel from the program */
+ kernel = clCreateKernel(program, kernel_name, &status);
+ if (status != CL_SUCCESS) {
+ fprintf(stderr, "error calling clCreateKernel\n");
+ goto error;
+ }
+
+ /* We are going to push NDRange kernels here */
+ queue = clCreateCommandQueue(ctx, device, 0, &status);
+ if (status != CL_SUCCESS) {
+ fprintf(stderr, "error calling clCreateCommandQueue\n");
+ goto error;
+ }
+
+exit:
+ free(ker_path);
+ cl_file_map_delete(fm);
+ return status;
+error:
+ goto exit;
+}
+
+void
+cl_test_destroy(void)
+{
+ clReleaseCommandQueue(queue);
+ clReleaseKernel(kernel);
+ clReleaseProgram(program);
+ clReleaseContext(ctx);
+}
+
+static const char *err_msg[] = {
+ [-CL_SUCCESS] = "CL_SUCCESS",
+ [-CL_DEVICE_NOT_FOUND] = "CL_DEVICE_NOT_FOUND",
+ [-CL_DEVICE_NOT_AVAILABLE] = "CL_DEVICE_NOT_AVAILABLE",
+ [-CL_COMPILER_NOT_AVAILABLE] = "CL_COMPILER_NOT_AVAILABLE",
+ [-CL_MEM_ALLOCATION_FAILURE] = "CL_MEM_ALLOCATION_FAILURE",
+ [-CL_OUT_OF_RESOURCES] = "CL_OUT_OF_RESOURCES",
+ [-CL_OUT_OF_HOST_MEMORY] = "CL_OUT_OF_HOST_MEMORY",
+ [-CL_PROFILING_INFO_NOT_AVAILABLE] = "CL_PROFILING_INFO_NOT_AVAILABLE",
+ [-CL_MEM_COPY_OVERLAP] = "CL_MEM_COPY_OVERLAP",
+ [-CL_IMAGE_FORMAT_MISMATCH] = "CL_IMAGE_FORMAT_MISMATCH",
+ [-CL_IMAGE_FORMAT_NOT_SUPPORTED] = "CL_IMAGE_FORMAT_NOT_SUPPORTED",
+ [-CL_BUILD_PROGRAM_FAILURE] = "CL_BUILD_PROGRAM_FAILURE",
+ [-CL_MAP_FAILURE] = "CL_MAP_FAILURE",
+ [-CL_MISALIGNED_SUB_BUFFER_OFFSET] = "CL_MISALIGNED_SUB_BUFFER_OFFSET",
+ [-CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST] = "CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST",
+ [-CL_INVALID_VALUE] = "CL_INVALID_VALUE",
+ [-CL_INVALID_DEVICE_TYPE] = "CL_INVALID_DEVICE_TYPE",
+ [-CL_INVALID_PLATFORM] = "CL_INVALID_PLATFORM",
+ [-CL_INVALID_DEVICE] = "CL_INVALID_DEVICE",
+ [-CL_INVALID_CONTEXT] = "CL_INVALID_CONTEXT",
+ [-CL_INVALID_QUEUE_PROPERTIES] = "CL_INVALID_QUEUE_PROPERTIES",
+ [-CL_INVALID_COMMAND_QUEUE] = "CL_INVALID_COMMAND_QUEUE",
+ [-CL_INVALID_HOST_PTR] = "CL_INVALID_HOST_PTR",
+ [-CL_INVALID_MEM] = "CL_INVALID_MEM",
+ [-CL_INVALID_IMAGE_FORMAT_DESCRIPTOR] = "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR",
+ [-CL_INVALID_IMAGE_SIZE] = "CL_INVALID_IMAGE_SIZE",
+ [-CL_INVALID_SAMPLER] = "CL_INVALID_SAMPLER",
+ [-CL_INVALID_BINARY] = "CL_INVALID_BINARY",
+ [-CL_INVALID_BUILD_OPTIONS] = "CL_INVALID_BUILD_OPTIONS",
+ [-CL_INVALID_PROGRAM] = "CL_INVALID_PROGRAM",
+ [-CL_INVALID_PROGRAM_EXECUTABLE] = "CL_INVALID_PROGRAM_EXECUTABLE",
+ [-CL_INVALID_KERNEL_NAME] = "CL_INVALID_KERNEL_NAME",
+ [-CL_INVALID_KERNEL_DEFINITION] = "CL_INVALID_KERNEL_DEFINITION",
+ [-CL_INVALID_KERNEL] = "CL_INVALID_KERNEL",
+ [-CL_INVALID_ARG_INDEX] = "CL_INVALID_ARG_INDEX",
+ [-CL_INVALID_ARG_VALUE] = "CL_INVALID_ARG_VALUE",
+ [-CL_INVALID_ARG_SIZE] = "CL_INVALID_ARG_SIZE",
+ [-CL_INVALID_KERNEL_ARGS] = "CL_INVALID_KERNEL_ARGS",
+ [-CL_INVALID_WORK_DIMENSION] = "CL_INVALID_WORK_DIMENSION",
+ [-CL_INVALID_WORK_GROUP_SIZE] = "CL_INVALID_WORK_GROUP_SIZE",
+ [-CL_INVALID_WORK_ITEM_SIZE] = "CL_INVALID_WORK_ITEM_SIZE",
+ [-CL_INVALID_GLOBAL_OFFSET] = "CL_INVALID_GLOBAL_OFFSET",
+ [-CL_INVALID_EVENT_WAIT_LIST] = "CL_INVALID_EVENT_WAIT_LIST",
+ [-CL_INVALID_EVENT] = "CL_INVALID_EVENT",
+ [-CL_INVALID_OPERATION] = "CL_INVALID_OPERATION",
+ [-CL_INVALID_GL_OBJECT] = "CL_INVALID_GL_OBJECT",
+ [-CL_INVALID_BUFFER_SIZE] = "CL_INVALID_BUFFER_SIZE",
+ [-CL_INVALID_MIP_LEVEL] = "CL_INVALID_MIP_LEVEL",
+ [-CL_INVALID_GLOBAL_WORK_SIZE] = "CL_INVALID_GLOBAL_WORK_SIZE",
+ [-CL_INVALID_PROPERTY] = "CL_INVALID_PROPERTY"
+};
+static const size_t err_msg_n = sizeof(err_msg) / sizeof(err_msg[0]);
+
+void
+cl_report_error(cl_int err)
+{
+ if (err > 0)
+ return;
+ if (-err > err_msg_n)
+ return;
+ if (err == CL_SUCCESS)
+ return;
+ fprintf(stderr, "error %s\n", err_msg[-err]);
+}
+
+void
+cl_report_perf_counters(cl_mem perf)
+{
+ cl_int status = CL_SUCCESS;
+ uint32_t *start = NULL, *end = NULL;
+ uint32_t i;
+ if (perf == NULL)
+ return;
+ start = clIntelMapBuffer(perf, &status);
+ assert(status == CL_SUCCESS && start != NULL);
+ end = start + 128;
+
+ printf("BEFORE\n");
+ for (i = 0; i < 6*8; ++i) {
+ if (i % 8 == 0) printf("\n");
+ printf("[%3u 0x%8x] ", i, start[i]);
+ }
+ printf("\n\n");
+
+ printf("AFTER\n");
+ for (i = 0; i < 6*8; ++i) {
+ if (i % 8 == 0) printf("\n");
+ printf("[%3u 0x%8x] ", i, end[i]);
+ }
+ printf("\n\n");
+
+ printf("DIFF\n");
+ for (i = 0; i < 6*8; ++i) {
+ if (i % 8 == 0) printf("\n");
+ printf("[%3u %8i] ", i, end[i] - start[i]);
+ }
+ printf("\n\n");
+
+ clIntelUnmapBuffer(perf);
+}
+
--- /dev/null
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia@intel.com>
+ */
+
+#ifndef __CL_TEST_H__
+#define __CL_TEST_H__
+
+#include "CL/cl.h"
+#include "common.h"
+
+#define TEST_SIMD8 0
+#define CALL(FN, ...) \
+ do { \
+ status = FN(__VA_ARGS__); \
+ if (status != CL_SUCCESS) { \
+ fprintf(stderr, "error calling %s\n", #FN); \
+ goto error; \
+ } \
+ } while (0)
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern cl_platform_id platform;
+extern cl_device_id device;
+extern cl_context ctx;
+extern cl_program program;
+extern cl_kernel kernel;
+extern cl_command_queue queue;
+
+enum {
+ SOURCE = 0,
+ LLVM = 1,
+ BIN = 2
+};
+
+/* Init the bunch of global varaibles here */
+extern int cl_test_init(const char *file_name, const char *kernel_name, int format);
+
+/* Release everything allocated in cl_test_init */
+extern void cl_test_destroy(void);
+
+/* Properly report the error in stderr */
+extern void cl_report_error(cl_int err);
+
+/* Nicely output the performance counters */
+extern void cl_report_perf_counters(cl_mem perf);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __CL_TEST_H__ */
+
--- /dev/null
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia@intel.com>
+ */
+
+#include "common.h"
+
+#include <stdlib.h>
+#include <malloc.h>
+
+void usage()
+{
+ printf("Bad usage\n");
+ exit(0);
+}
+
+void clpanic(const char *msg, int rval)
+{
+ printf("Failed: %s (%d)\n", msg, rval);
+ exit(-1);
+}
+
+void parseArgs(int argc, char **argv, struct args *pargs)
+{
+ char *p;
+ argv++;
+
+ while ((p = *argv++) != NULL) {
+ if (*p == '-') {
+ switch (*++p) {
+ case 'x':
+ pargs->x = atoi(*argv++);
+ break;
+ case 'y':
+ pargs->y = atoi(*argv++);
+ break;
+ case 'z':
+ pargs->z = atoi(*argv++);
+ break;
+ case 'W':
+ pargs->W = atoi(*argv++);
+ break;
+ case 'i':
+ pargs->i = atoi(*argv++);
+ break;
+ case 'v':
+ pargs->v = 1;
+ break;
+ case 'd':
+ pargs->d = atoi(*argv++);
+ break;
+ default:
+ usage();
+ break;
+ }
+ } else {
+ usage();
+ }
+ }
+ printf("x %d, y %d, z %d, W %d, d %d, v %d, i %d\n",
+ pargs->x, pargs->y, pargs->z, pargs->W, pargs->d, pargs->v,
+ pargs->i);
+
+}
+
+float randf()
+{
+ return ((float) rand())/((float)RAND_MAX);
+}
+
+float randf2(float lo, float hi)
+{
+ assert(lo <= hi);
+ return lo + (hi - lo) * randf();
+}
+
+cl_device_id getDeviceID(int devtype)
+{
+ int rval;
+ cl_platform_id platform;
+ cl_device_id device;
+
+ rval = clGetPlatformIDs(1, &platform, NULL);
+ if (rval != CL_SUCCESS)
+ clpanic("clGetPlatformIDs", rval);
+
+ rval =
+ clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 1, &device, NULL);
+ if (rval != CL_SUCCESS)
+ clpanic("GetDeviceIDs", rval);
+
+ return device;
+}
+
+int filesize(FILE *fp)
+{
+ fseek(fp, 0L, SEEK_END);
+ int fsize = ftell(fp);
+ fseek(fp, 0L, SEEK_SET); // Note this resets fp to beginning.
+ return fsize;
+}
+
+cl_kernel
+getKernel(cl_device_id dev, cl_context ctx,
+ const char *filename,
+ const char *kernelname)
+{
+ FILE *fp;
+ char *my_program;
+ size_t fsize;
+ int e;
+
+ fp = fopen(filename, "r");
+ assert(fp);
+ fsize = filesize(fp);
+
+ printf("File: %s, size=%d\n", filename, (int) fsize);
+ my_program = malloc(fsize);
+ assert(my_program);
+ e = fread(my_program, 1, fsize, fp);
+ assert(e == fsize);
+
+ cl_program program =
+ clCreateProgramWithSource(ctx, 1, (const char **) &my_program, &fsize, &e);
+ CHK_ERR(e);
+ e = clBuildProgram(program, 1, &dev, NULL, NULL, NULL);
+ CHK_ERR(e);
+
+ if (e != CL_SUCCESS) {
+ /* Print out build log */
+ char build_log[1000];
+ size_t logsize = sizeof(build_log) - 1;
+ e = clGetProgramBuildInfo(program, dev,
+ CL_PROGRAM_BUILD_LOG, logsize,
+ build_log, &logsize);
+ build_log[logsize] = 0;
+ printf("Build Failed:\n%s\n", build_log);
+ exit(-1);
+ }
+
+ cl_kernel kernel = clCreateKernel(program, kernelname, &e);
+ CHK_ERR(e);
+ printf("Extracted kernel %s from file %s\n", kernelname, filename);
+ return kernel;
+}
+
+cl_kernel
+getKernelFromBinary(cl_device_id dev, cl_context ctx,
+ const char *filename,
+ const char *kernelname)
+{
+ FILE *fp;
+ char *my_program;
+ size_t fsize;
+ int e;
+
+ fp = fopen(filename, "r");
+ assert(fp);
+ fsize = filesize(fp);
+
+ printf("File: %s, size=%d\n", filename, (int) fsize);
+ my_program = malloc(fsize);
+ assert(my_program);
+ e = fread(my_program, 1, fsize, fp);
+ assert(e == fsize);
+
+ cl_program program = clCreateProgramWithBinary(ctx,
+ 1, &dev, &fsize,
+ (const unsigned char **) &my_program,
+ NULL, &e);
+ CHK_ERR(e);
+ e = clBuildProgram(program, 1, &dev, NULL, NULL, NULL);
+ CHK_ERR(e);
+ cl_kernel kernel = clCreateKernel(program, kernelname, &e);
+ CHK_ERR(e);
+ printf("Extracted kernel %s from file %s\n", kernelname, filename);
+ return kernel;
+}
+
+void *newBuffer(int bufsiz, int etype)
+{
+ char *p;
+ float *f, scale;
+ int i;
+
+ void *buf = (void *) memalign(32, bufsiz);
+ assert(buf);
+ rand();
+
+ switch (etype) {
+ case 0:
+ case '0':
+ memset(buf, 0, bufsiz);
+ break;
+
+ case 'i':
+ for (i = 0, p = buf; i < bufsiz; i++) {
+ p[i] = (rand() & 0xff);
+ }
+ break;
+
+ case 'f':
+ case 'p':
+ scale = (etype == 'p') ? 255.0f : 1.0f;
+ for (i = 0, f = buf; i < (bufsiz / sizeof(float)); i++) {
+ f[i] = scale * ((float) rand()) / RAND_MAX;
+ }
+ break;
+
+ default:
+ assert(0);
+ }
+ return buf;
+
+}
+
+int comparef(const float *refData, const float *data, int n, float eps)
+{
+ float err = 0.0f;
+ float ref = 0.0f;
+ int i;
+
+ for (i = 1; i < n; ++i) {
+ float diff = refData[i] - data[i];
+ err += diff * diff;
+ ref += refData[i] * refData[i];
+ }
+
+ float normRef = sqrtf(ref);
+ if (fabsf(ref) < 1e-7f) {
+ printf("*FAIL* comparef: ref < 1e-7 (%12.8f)\n", ref);
+ return 0;
+ }
+ float normError = sqrtf(err);
+ err = normError / normRef;
+ printf("comparef: err=%12.8f, eps=%12.8f\n", err, eps);
+ if (err < eps) {
+ printf("PASSED\n");
+ return 1;
+ } else {
+ printf("FAILED\n");
+ return 0;
+ }
+}
+
+#if 0
+// Unit test
+int main(int argc, char **argv)
+{
+ struct args args = { 0 };
+
+ parseArgs(argc, argv, &args);
+ printf("x %d, y %d, z %d, W %d, d %d, v %d, i %d\n",
+ args.x, args.y, args.z, args.W, args.d, args.v, args.i);
+
+}
+#endif
+
+struct bmphdr {
+ // 2 bytes of magic here, "BM", total header size is 54 bytes!
+ int filesize; // 4 total file size incl header
+ short as0, as1; // 8 app specific
+ int bmpoffset; // 12 ofset of bmp data
+ int headerbytes; // 16 bytes in header from this point (40 actually)
+ int width; // 20
+ int height; // 24
+ short nplanes; // 26 no of color planes
+ short bpp; // 28 bits/pixel
+ int compression; // 32 BI_RGB = 0 = no compression
+ int sizeraw; // 36 size of raw bmp file, excluding header, incl padding
+ int hres; // 40 horz resolutions pixels/meter
+ int vres; // 44
+ int npalcolors; // 48 No of colors in palette
+ int nimportant; // 52 No of important colors
+ // raw b, g, r data here, dword aligned per scan line
+};
+
+int *readBmp(const char *filename, int *width, int *height)
+{
+#ifndef NDEBUG
+ int n = 0;
+#endif /* NDEBUG */
+ struct bmphdr hdr;
+
+ FILE *fp = fopen(filename, "rb");
+ assert(fp);
+
+ char magic[2];
+ IF_DEBUG(n =) fread(&magic[0], 1, 2, fp);
+ assert(n == 2 && magic[0] == 'B' && magic[1] == 'M');
+
+ IF_DEBUG(n =) fread(&hdr, 1, sizeof(hdr), fp);
+ assert(n == sizeof(hdr));
+
+#define DEBUG 1
+#ifdef DEBUG
+ // Dump stuff out
+ printf(" filesize = %d\n", hdr.filesize); // total file size incl header
+ printf(" as0 = %d\n", hdr.as0);
+ printf(" as1 = %d\n", hdr.as1);
+ printf(" bmpoffset = %d\n", hdr.bmpoffset); // ofset of bmp data
+ printf("headerbytes = %d\n", hdr.headerbytes); // bytes in header from this point (40 actually)
+ printf(" width = %d\n", hdr.width);
+ printf(" height = %d\n", hdr.height);
+ printf(" nplanes = %d\n", hdr.nplanes); // no of color planes
+ printf(" bpp = %d\n", hdr.bpp); // bits/pixel
+ printf("compression = %d\n", hdr.compression); // BI_RGB = 0 = no compression
+ printf(" sizeraw = %d\n", hdr.sizeraw); // size of raw bmp file, excluding header, incl padding
+ printf(" hres = %d\n", hdr.hres); // horz resolutions pixels/meter
+ printf(" vres = %d\n", hdr.vres);
+ printf(" npalcolors = %d\n", hdr.npalcolors); // No of colors in palette
+ printf(" nimportant = %d\n", hdr.nimportant); // No of important colors
+#endif
+ assert(hdr.width > 0 && hdr.height > 0 && hdr.nplanes == 1 && hdr.compression == 0);
+
+ int *rgb32 = (int *) malloc(hdr.width * hdr.height * sizeof(int));
+ assert(rgb32);
+ int x, y;
+
+ int *dst = rgb32;
+ for (y = 0; y < hdr.height; y++) {
+ for (x = 0; x < hdr.width; x++) {
+ assert(!feof(fp));
+ int b = (getc(fp) & 0x0ff);
+ int g = (getc(fp) & 0x0ff);
+ int r = (getc(fp) & 0x0ff);
+ *dst++ = (r | (g << 8) | (b << 16) | 0xff000000); /* abgr */
+ }
+ while (x & 3) {
+ getc(fp);
+ x++;
+ } // each scanline padded to dword
+ // printf("read row %d\n", y);
+ // fflush(stdout);
+ }
+ fclose(fp);
+ *width = hdr.width;
+ *height = hdr.height;
+ return rgb32;
+}
+
+void writeBmp(const int *data, int width, int height, const char *filename)
+{
+#ifndef NDEBUG
+ int n = 0;
+#endif /* NDEBUG */
+ int x, y;
+
+ FILE *fp = fopen(filename, "wb");
+ assert(fp);
+
+ char *raw = (char *) malloc(width * height * sizeof(int)); // at most
+ assert(raw);
+ char *p = raw;
+
+ for (y = 0; y < height; y++) {
+ for (x = 0; x < width; x++) {
+ int c = *data++;
+ *p++ = ((c >> 16) & 0x0ff);
+ *p++ = ((c >> 8) & 0x0ff);
+ *p++ = ((c >> 0) & 0x0ff);
+ }
+ while (x & 3) {
+ *p++ = 0;
+ x++;
+ } // pad to dword
+ }
+ int sizeraw = p - raw;
+ int scanline = (width * 3 + 3) & ~3;
+ assert(sizeraw == scanline * height);
+
+ struct bmphdr hdr;
+
+ hdr.filesize = scanline * height + sizeof(hdr) + 2;
+ hdr.as0 = 0;
+ hdr.as1 = 0;
+ hdr.bmpoffset = sizeof(hdr) + 2;
+ hdr.headerbytes = 40;
+ hdr.width = width;
+ hdr.height = height;
+ hdr.nplanes = 1;
+ hdr.bpp = 24;
+ hdr.compression = 0;
+ hdr.sizeraw = sizeraw;
+ hdr.hres = 0; // 2834;
+ hdr.vres = 0; // 2834;
+ hdr.npalcolors = 0;
+ hdr.nimportant = 0;
+
+ /* Now write bmp file */
+ char magic[2] = { 'B', 'M' };
+ IF_DEBUG(n =) fwrite(&magic[0], 1, 2, fp);
+ assert(n == 2);
+
+ IF_DEBUG(n =) fwrite(&hdr, 1, sizeof(hdr), fp);
+ assert(n == sizeof(hdr));
+
+ IF_DEBUG(n =) fwrite(raw, 1, hdr.sizeraw, fp);
+ assert(n == hdr.sizeraw);
+
+ fclose(fp);
+ free(raw);
+
+#ifdef DEBUG
+ printf("Write bmp file %s\n", filename);
+#endif
+}
+
+char*
+readFulsimDump(const char *name, size_t *size)
+{
+ char *raw = NULL, *dump = NULL;
+ size_t i, sz;
+ int w, h;
+ if ((raw = (char*) readBmp(name, &w, &h)) == NULL)
+ return NULL;
+ sz = w * h;
+ dump = (char*) malloc(sz);
+ assert(dump);
+ for (i = 0; i < sz; ++i)
+ dump[i] = raw[4*i];
+ free(raw);
+ if (size)
+ *size = sz;
+ return dump;
+}
+
+char *do_kiss_path(const char *file, cl_device_id device)
+{
+ cl_int ver;
+ const char *sub_path = NULL;
+ char *ker_path = NULL;
+ const char *kiss_path = getenv("OCL_KERNEL_PATH");
+ size_t sz = strlen(file);
+
+ if (device == NULL)
+ sub_path = "";
+ else {
+ if (clIntelGetGenVersion(device, &ver) != CL_SUCCESS)
+ clpanic("Unable to get Gen version", -1);
+#if 0
+ if (ver == 6)
+ sub_path = "gen6/";
+ else if (ver == 7)
+ sub_path = "gen7/";
+ else if (ver == 75)
+ sub_path = "gen75/";
+ else
+ clpanic("unknow gen device", -1);
+#else
+
+ sub_path = "";
+#endif
+ }
+
+ if (kiss_path == NULL)
+ clpanic("set OCL_KERNEL_PATH. This is where the kiss kernels are", -1);
+ sz += strlen(kiss_path) + strlen(sub_path) + 2; /* +1 for end of string, +1 for '/' */
+ if ((ker_path = malloc(sz)) == NULL)
+ clpanic("Allocation failed", -1);
+ sprintf(ker_path, "%s/%s%s", kiss_path, sub_path, file);
+ return ker_path;
+}
+
--- /dev/null
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia@intel.com>
+ */
+
+#ifndef __COMMON_H__
+#define __COMMON_H__
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+#include <assert.h>
+#include <CL/cl.h>
+#include <CL/cl_intel.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct args {
+ int x, y, z;
+ int W; /* -W nnn workgroup size */
+ int d; /* -d [cpu|gpu] 0 or 1 */
+ int v; /* -v verify */
+ int i; /* -i iteration */
+};
+
+#ifdef NDEBUG
+#define IF_DEBUG(EXPR)
+#else
+#define IF_DEBUG(EXPR) EXPR
+#endif
+
+#define CHK_ERR(x) assert(x == CL_SUCCESS)
+
+extern void usage();
+extern void parseArgs(int argcc, char **argv, struct args * pargs);
+extern void clpanic(const char *msg, int rval);
+extern float randf();
+extern float randf2(float lo, float hi);
+extern cl_device_id getDeviceID(int devtype);
+extern cl_kernel getKernel(cl_device_id dev, cl_context ctx,
+ const char *filename, const char *kernelname);
+extern cl_kernel getKernelFromBinary(cl_device_id dev, cl_context ctx,
+ const char *filename,
+ const char *kernelname);
+extern void *newBuffer(int bufsiz, int etype);
+int comparef(const float *refData, const float *data, int n, float eps);
+int *readBmp(const char *filename, int *width, int *height);
+void writeBmp(const int *data, int width, int height, const char *filename);
+char* readFulsimDump(const char *name, size_t *size);
+extern char *do_kiss_path(const char *file, cl_device_id device);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __COMMON_H__ */
+
--- /dev/null
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia@intel.com>
+ */
+
+#include "common.h"
+#include "cl_test.h"
+
+void verify(float*);
+
+int width = 16;
+int height = 16;
+int blockWidth = 8;
+int blockSize = 8 * 8;
+int inverse = 0;
+
+float *input, *output, *ref;
+
+int main(int argc, char**argv)
+{
+ struct args args = {0};
+ int err, i;
+ char *ker_path = NULL;
+
+ const cl_float a = cos(M_PI/16)/2;
+ const cl_float b = cos(M_PI/8 )/2;
+ const cl_float c = cos(3*M_PI/16)/2;
+ const cl_float d = cos(5*M_PI/16)/2;
+ const cl_float e = cos(3*M_PI/8)/2;
+ const cl_float f = cos(7*M_PI/16)/2;
+ const cl_float g = 1.0f/sqrt(8.0f);
+
+ cl_float dct8x8[64] = {
+ g, a, b, c, g, d, e, f,
+ g, c, e, -f, -g, -a, -b, -d,
+ g, d, -e, -a, -g, f, b, c,
+ g, f, -b, -d, g, c, -e, -a,
+ g, -f, -b, d, g, -c, -e, a,
+ g, -d, -e, a, -g, -f, b, -c,
+ g, -c, e, f, -g, a, -b, d,
+ g, -a, b, -c, g, -d, e, f
+ };
+
+ parseArgs(argc, argv, &args);
+
+ cl_device_id device = getDeviceID(args.d);
+#if TEST_SIMD8
+ ker_path = do_kiss_path("dct_kernels_0.bin8", device);
+#else
+ ker_path = do_kiss_path("dct_kernels_0.bin", device);
+#endif
+ cl_context context = clCreateContext(0, 1, &device, NULL, NULL, &err); CHK_ERR(err);
+ cl_command_queue queue = clCreateCommandQueue(context, device, 0, &err); CHK_ERR(err);
+ cl_kernel kernel = getKernelFromBinary(device, context, ker_path, "DCT");
+
+ int bytes = width * height * sizeof(cl_float);
+ input = newBuffer(bytes, 0);
+ output = newBuffer(bytes, 0);
+
+ for (i = 0; i < width*height; i++)
+ input[i] = randf() * 255.0f;
+
+ cl_mem inputBuffer = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, bytes, input, &err);
+ CHK_ERR(err);
+ cl_mem outputBuffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_COPY_HOST_PTR, bytes, output, &err);
+ CHK_ERR(err);
+ cl_mem dctBuffer = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(cl_float) * blockSize, dct8x8, &err);
+ CHK_ERR(err);
+
+ /* Execute */
+ /*** Set appropriate arguments to the kernel ***/
+ err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &outputBuffer); CHK_ERR(err);
+ err = clSetKernelArg(kernel, 1, sizeof(cl_mem), &inputBuffer); CHK_ERR(err);
+ err = clSetKernelArg(kernel, 2, sizeof(cl_mem), &dctBuffer); CHK_ERR(err);
+ err = clSetKernelArg(kernel, 3, blockWidth * blockWidth * sizeof(cl_float), NULL); CHK_ERR(err);
+ err = clSetKernelArg(kernel, 4, sizeof(cl_uint), &width); CHK_ERR(err);
+ err = clSetKernelArg(kernel, 5, sizeof(cl_uint), &blockWidth); CHK_ERR(err);
+ err = clSetKernelArg(kernel, 6, sizeof(cl_uint), &inverse); CHK_ERR(err);
+
+ size_t globalThreads[2] = {width, height};
+ size_t localThreads[2] = {blockWidth, blockWidth};
+
+ err = clEnqueueNDRangeKernel(queue, kernel, 2, NULL, globalThreads, localThreads, 0, NULL, NULL);
+ CHK_ERR(err);
+
+ output = clIntelMapBuffer(outputBuffer, &err);
+ verify(dct8x8);
+ return 0;
+}
+
+/*
+ * Reference implementation of the Discrete Cosine Transfrom on the CPU
+ */
+ cl_uint
+getIdx(cl_uint blockIdx, cl_uint blockIdy, cl_uint localIdx, cl_uint localIdy, cl_uint blockWidth, cl_uint globalWidth)
+{
+ cl_uint globalIdx = blockIdx * blockWidth + localIdx;
+ cl_uint globalIdy = blockIdy * blockWidth + localIdy;
+
+ return (globalIdy * globalWidth + globalIdx);
+}
+
+ void
+DCTCPUReference(cl_float *ref,
+ const cl_float * input ,
+ const cl_float * dct8x8 ,
+ const cl_uint width,
+ const cl_uint height,
+ const cl_uint numBlocksX,
+ const cl_uint numBlocksY,
+ const cl_uint inverse)
+{
+ cl_float * temp = (cl_float *)malloc(width*height*sizeof(cl_float));
+
+ /* for each block in the image */
+ cl_uint blockIdy, blockIdx, i,j, k;
+ for(blockIdy=0; blockIdy < numBlocksY; ++blockIdy)
+ for(blockIdx=0; blockIdx < numBlocksX; ++blockIdx)
+ {
+ // First calculate A^T * X
+ for(j=0; j < blockWidth ; ++j)
+ for(i = 0; i < blockWidth ; ++i)
+ {
+ cl_uint index = getIdx(blockIdx, blockIdy, i, j, blockWidth, width);
+ cl_float tmp = 0.0f;
+ for(k=0; k < blockWidth; ++k)
+ {
+ // multiply with dct8x8(k,i) if forward DCT and dct8x8(i,k) if inverse DCT
+ cl_uint index1 = (inverse) ? i*blockWidth +k : k*blockWidth + i;
+ //input(k,j)
+ cl_uint index2 = getIdx(blockIdx, blockIdy, j, k, blockWidth, width);
+
+ tmp += dct8x8[index1]*input[index2];
+ }
+ temp[index] = tmp;
+ //ref[index] = tmp;
+ }
+ // And now multiply the result of previous step with A i.e. calculate (A^T * X) * A
+ for(j=0; j < blockWidth ; ++j)
+ for(i = 0; i < blockWidth ; ++i)
+ {
+ cl_uint index = getIdx(blockIdx, blockIdy, i, j, blockWidth, width);
+ cl_float tmp = 0.0f;
+ for(k=0; k < blockWidth; ++k)
+ {
+ //input(i,k)
+ cl_uint index1 = getIdx(blockIdx, blockIdy, k, i, blockWidth, width);
+
+ // multiply with dct8x8(k,j) if forward DCT and dct8x8(j,k) if inverse DCT
+ cl_uint index2 = (inverse) ? j*blockWidth +k : k*blockWidth + j;
+
+ tmp += temp[index1]*dct8x8[index2];
+ }
+ ref[index] = tmp;
+ }
+ }
+ free(temp);
+}
+
+void verify(float* dct8x8)
+{
+ printf("Passed\n");
+ ref = (cl_float*) newBuffer(width*height*sizeof(cl_float), 0);
+ DCTCPUReference(ref, input, dct8x8, width, height, width/blockWidth, height/blockWidth, inverse);
+#if 1
+ int i, j;
+ for (j=0; j<height; j++, printf("\n"))
+ for (i=0; i<width; i++) {
+ printf("[%4.4f %4.4f]", output[i+j*width], ref[i+j*width]);
+ }
+#endif
+ int resC = comparef(output, ref, width * height, 1.0e-6f);
+ printf("%d\n", resC);
+}
+
--- /dev/null
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia@intel.com>
+ */
+
+#include "common.h"
+#include "cl_test.h"
+
+void verify();
+
+int length = 2048;
+float *input, *output, *refout;
+
+int main(int argc, char**argv)
+{
+ struct args args = {0};
+ int err;
+ char *ker_path = NULL;
+
+ parseArgs(argc, argv, &args);
+
+ cl_device_id device = getDeviceID(args.d);
+#if TEST_SIMD8
+ ker_path = do_kiss_path("fastWalsh_kernels_0.bin8", device);
+#else
+ ker_path = do_kiss_path("fastWalsh_kernels_0.bin", device);
+#endif
+ cl_context context = clCreateContext(0, 1, &device, NULL, NULL, &err); CHK_ERR(err);
+ cl_command_queue queue = clCreateCommandQueue(context, device, 0, &err); CHK_ERR(err);
+ cl_kernel kernel = getKernelFromBinary(device, context, ker_path, "fastWalshTransform");
+
+ input = newBuffer(length * sizeof(float), 'p');
+ refout = newBuffer(length * sizeof(float), 0 );
+ memcpy(refout, input, length * sizeof(float));
+
+ cl_mem inputBuffer = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR, length * sizeof(float), input, &err); CHK_ERR(err);
+
+ /* Execute */
+ size_t globalThreads[1] = { length/2};
+ size_t localThreads[1] = { 256};
+
+ /* the input array - also acts as output*/
+ err = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&inputBuffer); CHK_ERR(err);
+
+ int step;
+
+ for(step = 1; step < length; step<<= 1) {
+ /* stage of the algorithm */
+ err = clSetKernelArg( kernel, 1, sizeof(int), &step); CHK_ERR(err);
+
+ /* Enqueue a kernel run call */
+ err = clEnqueueNDRangeKernel(queue,
+ kernel, 1, NULL,
+ globalThreads,
+ localThreads,
+ 0,
+ NULL,
+ NULL); CHK_ERR(err);
+
+ /* wait for the kernel call to finish execution */
+ /* err = clWaitForEvents(1, &eND); CHK_ERR(err); */
+ }
+
+#if 0
+ /* Enqueue readBuffer*/
+ err = clEnqueueReadBuffer(queue, inputBuffer, CL_TRUE, 0, length * sizeof(cl_float), output, 0, NULL, NULL); CHK_ERR(err);
+#else
+ output = clIntelMapBuffer(inputBuffer, &err);
+
+#endif
+ verify();
+ return 0;
+}
+
+void fastWalshCPU(float *vinput, int length)
+{
+ int step;
+
+ /* for each pass of the algorithm */
+ for(step=1; step < length; step <<=1) {
+ /* length of each block */
+ cl_uint jump = step << 1;
+
+ /* for each blocks */
+ cl_uint group;
+ for(group = 0; group < step; ++group) {
+ /* for each pair of elements with in the block */
+ cl_uint pair;
+
+ for(pair = group; pair < length; pair += jump) {
+ /* find its partner */
+ cl_uint match = pair + step;
+
+ cl_float T1 = vinput[pair];
+ cl_float T2 = vinput[match];
+
+ /* store the sum and difference of the numbers in the same locations */
+ vinput[pair] = T1 + T2;
+ vinput[match] = T1 - T2;
+ }
+ }
+ }
+}
+
+void verify()
+{
+ int i;
+
+ fastWalshCPU(refout, length);
+ comparef(output, refout, length, 1.0e-6);
+ for (i = 0; i < 20; i++) {
+ printf("%20.5f %20.8f\n", output[i], refout[i]);
+ }
+ printf("Done\n");
+}
+
--- /dev/null
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia@intel.com>
+ */
+
+#include "common.h"
+#include "cl_test.h"
+
+void verify();
+
+int length = 1024;
+float *input_i, *input_r, *output_i, *output_r, *refOutput_r, *refOutput_i;
+
+int main(int argc, char**argv)
+{
+ char *ker_path = NULL;
+ struct args args = {0};
+ int err;
+
+ parseArgs(argc, argv, &args);
+
+ cl_device_id device = getDeviceID(args.d);
+#if TEST_SIMD8
+ ker_path = do_kiss_path("fft_kernels_0.bin", device);
+#else
+ ker_path = do_kiss_path("fft_kernels_0.bin8", device);
+#endif
+ cl_context context = clCreateContext(0, 1, &device, NULL, NULL, &err); CHK_ERR(err);
+ cl_command_queue queue = clCreateCommandQueue(context, device, 0, &err); CHK_ERR(err);
+ cl_kernel kernel = getKernelFromBinary(device, context, ker_path, "kfft");
+
+ cl_uint inputSizeBytes = length * sizeof(cl_float);
+
+ /* allocate and init memory used by host */
+ input_r = newBuffer(inputSizeBytes, 'p');
+ input_i = newBuffer(inputSizeBytes, 0 );
+ output_i = newBuffer(inputSizeBytes, 0 );
+ output_r = newBuffer(inputSizeBytes, 0 );
+
+ memcpy(output_i, input_i, inputSizeBytes);
+ memcpy(output_r, input_r, inputSizeBytes);
+
+ cl_mem buffer_r = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, inputSizeBytes, input_r, &err); CHK_ERR(err);
+ cl_mem buffer_i = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, inputSizeBytes, input_i, &err); CHK_ERR(err);
+
+ size_t globalThreads[1] = {64};
+ size_t localThreads[1] = {64};
+
+ err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &buffer_r); CHK_ERR(err);
+ err = clSetKernelArg(kernel, 1, sizeof(cl_mem), &buffer_i); CHK_ERR(err);
+
+ err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, globalThreads, localThreads, 0, NULL, NULL);
+ CHK_ERR(err);
+
+ output_i = clIntelMapBuffer(buffer_i, &err); CHK_ERR(err);
+ output_r = clIntelMapBuffer(buffer_r, &err); CHK_ERR(err);
+ verify();
+
+ return 0;
+}
+
+/*
+ This computes an in-place complex-to-complex FFT
+ x and y are the real and imaginary arrays of 2^m points.
+ dir = 1 gives forward transform
+ dir = -1 gives reverse transform
+ */
+void fftCPU(int dir,long m,cl_float *x,cl_float *y)
+{
+ long n,i,i1,j,k,i2,l,l1,l2;
+ double c1,c2,tx,ty,t1,t2,u1,u2,z;
+
+ /* Calculate the number of points */
+ n = 1;
+ for (i=0;i<m;i++)
+ n *= 2;
+
+ /* Do the bit reversal */
+ i2 = n >> 1;
+ j = 0;
+ for (i=0;i<n-1;i++)
+ {
+ if (i < j)
+ {
+ tx = x[i];
+ ty = y[i];
+ x[i] = x[j];
+ y[i] = y[j];
+ x[j] = (cl_float)tx;
+ y[j] = (cl_float)ty;
+ }
+ k = i2;
+ while (k <= j)
+ {
+ j -= k;
+ k >>= 1;
+ }
+ j += k;
+ }
+
+ /* Compute the FFT */
+ c1 = -1.0;
+ c2 = 0.0;
+ l2 = 1;
+ for (l=0;l<m;l++)
+ {
+ l1 = l2;
+ l2 <<= 1;
+ u1 = 1.0;
+ u2 = 0.0;
+ for (j=0;j<l1;j++)
+ {
+ for (i=j;i<n;i+=l2)
+ {
+ i1 = i + l1;
+ t1 = u1 * x[i1] - u2 * y[i1];
+ t2 = u1 * y[i1] + u2 * x[i1];
+ x[i1] = (cl_float)(x[i] - t1);
+ y[i1] = (cl_float)(y[i] - t2);
+ x[i] += (cl_float)t1;
+ y[i] += (cl_float)t2;
+ }
+ z = u1 * c1 - u2 * c2;
+ u2 = u1 * c2 + u2 * c1;
+ u1 = z;
+ }
+ c2 = sqrt((1.0 - c1) / 2.0);
+ if (dir == 1)
+ c2 = -c2;
+ c1 = sqrt((1.0 + c1) / 2.0);
+ }
+
+ /* Scaling for forward transform */
+ /*if (dir == 1) {
+ for (i=0;i<n;i++) {
+ x[i] /= n;
+ y[i] /= n;
+ }
+ }*/
+}
+
+/**
+ * Reference CPU implementation of FFT Convolution
+ * for performance comparison
+ */
+void fftCPUReference()
+{
+ refOutput_r = newBuffer(length * sizeof(float), 0 );
+ refOutput_i = newBuffer(length * sizeof(float), 0 );
+
+ /* Copy data from input to reference buffers */
+ memcpy(refOutput_r, input_r, length * sizeof(cl_float));
+ memcpy(refOutput_i, input_i, length * sizeof(cl_float));
+
+ /* Compute reference FFT */
+ fftCPU(1, 10, refOutput_r, refOutput_i);
+}
+
+void verify()
+{
+ fftCPUReference();
+ int i;
+ comparef(refOutput_r, output_r, length, 1.0e-5f);
+ comparef(refOutput_i, output_i, length, 1.0e-5f);
+ printf("Passed\n");
+
+ for (i = 0; i < length; i++) {
+ printf("%i %12.5f, %12.5f | %12.5f, %12.5f | %12.5f, %12.5f\n",
+ i, input_r[i], input_i[i],
+ refOutput_r[i], output_r[i],
+ refOutput_i[i], output_i[i]);
+ }
+ printf("...\n");
+
+ for (i = 0; i < 8; i++) {
+ printf("%12.5f, %12.5f | %12.5f, %12.5f | %12.5f, %12.5f\n",
+ input_r[length-1-i], input_i[length-1-i],
+ refOutput_r[length-1-i], output_r[length-1-i],
+ refOutput_i[length-1-i], output_i[length-1-i]);
+ }
+}
+
--- /dev/null
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia@intel.com>
+ */
+
+#include "cl_test.h"
+
+static int *dst = NULL;
+static const size_t w = 64;
+static const size_t h = 64;
+static const size_t iter = 4;
+
+int
+main(int argc, char **argv)
+{
+ const size_t global[2] = {w, h};
+ const size_t local[2] = {16, 1};
+ const size_t sz = w * h * sizeof(char[4]);
+ int status = 0;
+
+#if TEST_SIMD8
+ if ((status = cl_test_init("mandelbrot_0.bin8", "render")) != 0)
+ goto error;
+#else
+ if ((status = cl_test_init("mandelbrot_0.bin", "render")) != 0)
+ goto error;
+#endif
+
+ /* One dry run */
+ cl_mem cl_dst = clCreateBuffer(ctx, CL_MEM_PINNABLE, sz, NULL, &status);
+
+ /* Set source and destination */
+ CALL (clSetKernelArg, kernel, 0, sizeof(cl_mem), &cl_dst);
+
+ /* Run the kernel */
+ CALL (clEnqueueNDRangeKernel, queue, kernel, 2, NULL, global, local, 0, NULL, NULL);
+ dst = (int *) clIntelMapBuffer(cl_dst, &status);
+
+ writeBmp(dst, w, h, "mandelbrot.bmp");
+ CALL (clIntelUnmapBuffer, cl_dst);
+ CALL (clReleaseMemObject, cl_dst);
+ cl_test_destroy();
+ printf("%i memory leaks\n", clIntelReportUnfreed());
+
+error:
+ cl_report_error(status);
+ return status;
+}
+
--- /dev/null
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia@intel.com>
+ */
+
+#include "common.h"
+void verify();
+
+float *input0, *input1, *output, *refout;
+int vectorsize = 4;
+int blockSize = 8;
+
+/* must be multiples of vectorsize and blocksize */
+#define M 32
+#define N 32
+#define K 32
+
+int width0 = M;
+int height0 = N;
+
+int width1 = K;
+int height1 = M;
+
+int main(int argc, char**argv)
+{
+ struct args args = {0};
+ char *ker_path = NULL;
+ int err;
+
+ parseArgs(argc, argv, &args);
+
+ cl_device_id device = getDeviceID(args.d);
+ ker_path = do_kiss_path("matmul_kernels_0.bin", device);
+ cl_context context = clCreateContext(0, 1, &device, NULL, NULL, &err); CHK_ERR(err);
+ cl_command_queue queue = clCreateCommandQueue(context, device, 0, &err); CHK_ERR(err);
+ cl_kernel kernel = getKernelFromBinary(device, context, ker_path, "mmmKernel");
+
+ input0 = (cl_float *) newBuffer(width0 * height0 * sizeof(cl_float), 'p');
+ input1 = (cl_float *) newBuffer(width1 * height1 * sizeof(cl_float), 'p');
+ output = (cl_float *) newBuffer(width1 * height0 * sizeof(cl_float), 0 );
+ refout = (cl_float *) newBuffer(width1 * height0 * sizeof(cl_float), 0 );
+
+ cl_mem inputBuffer0 = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR,
+ width0 * height0 * sizeof(cl_float), input0, &err); CHK_ERR(err);
+ cl_mem inputBuffer1 = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR,
+ width1 * height1 * sizeof(cl_float), input1, &err); CHK_ERR(err);
+ cl_mem outputBuffer = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR,
+ width1 * height0 * sizeof(cl_float), output, &err); CHK_ERR(err);
+
+ /* Execute */
+ size_t globalThreads[2] = {width1 / 4, height0/ 4};
+ size_t localThreads [2] = {blockSize, blockSize};
+
+ err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputBuffer0); CHK_ERR(err);
+ err = clSetKernelArg(kernel, 1, sizeof(cl_mem), &inputBuffer1); CHK_ERR(err);
+ err = clSetKernelArg(kernel, 2, sizeof(cl_mem), &outputBuffer); CHK_ERR(err);
+ err = clSetKernelArg(kernel, 3, sizeof(cl_int), &width0); CHK_ERR(err);
+ err = clSetKernelArg(kernel, 4, sizeof(cl_int), &width1); CHK_ERR(err);
+
+ err = clEnqueueNDRangeKernel(queue, kernel, 2, NULL, globalThreads, localThreads, 0, NULL, NULL);
+ output = clIntelMapBuffer(outputBuffer, &err);
+
+ verify();
+ clIntelUnmapBuffer(outputBuffer);
+ return err;
+}
+
+void
+matmulCPU(float *output, float *input0, float * input1, int y, int x, int z)
+{
+ int i, j, k;
+
+ for(i=0; i < y; i++)
+ for(j=0; j < z; j++)
+ for(k=0; k < x; k++)
+ output[i*z + j] += (input0[i*x + k]*input1[k*z + j]);
+}
+
+void verify()
+{
+ matmulCPU(refout, input0, input1, height0, width0, width1);
+ comparef(refout, output, width1 * height0, 1.0e-5f );
+ printf("Done\n");
+}
+
--- /dev/null
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia@intel.com>
+ */
+
+#include "common.h"
+#include "cl_test.h"
+#include "CL/cl_intel.h"
+
+void verify();
+
+int width = 256;
+int height = 256;
+int blockSizeX = 8;
+int blockSizeY = 8;
+int mulFactor = 2;
+float *seeds, *deviceResult;
+
+int main(int argc, char**argv)
+{
+ char *ker_path = NULL;
+ struct args args = {0};
+ int err, i;
+
+ parseArgs(argc, argv, &args);
+
+ cl_device_id device = getDeviceID(args.d);
+#if TEST_SIMD8
+ ker_path = do_kiss_path("mersenne_kernels_0.bin8", device);
+#else
+ ker_path = do_kiss_path("mersenne_kernels_0.bin", device);
+#endif
+ cl_context context = clCreateContext(0, 1, &device, NULL, NULL, &err); CHK_ERR(err);
+ cl_command_queue queue = clCreateCommandQueue(context, device, 0, &err); CHK_ERR(err);
+ cl_kernel kernel = getKernelFromBinary(device, context, ker_path, "gaussianRand");
+
+ seeds = newBuffer( width * height * sizeof(cl_uint4), 0);
+// deviceResult = newBuffer( width * height * mulFactor * sizeof(cl_float4), 0);
+ for (i = 0; i < width * height * 4; ++i) {
+ seeds[i] = (unsigned int)rand();
+ }
+
+ cl_mem seedsBuf = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, width * height * sizeof(cl_float4), seeds, &err); CHK_ERR(err);
+ cl_mem resultBuf = clCreateBuffer(context, CL_MEM_WRITE_ONLY, width * height * sizeof(cl_float4) * mulFactor, NULL, &err); CHK_ERR(err);
+
+ size_t globalThreads[2] = {width, height};
+ size_t localThreads[2] = {blockSizeX, blockSizeY};
+
+ err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &seedsBuf); CHK_ERR(err);
+ err = clSetKernelArg(kernel, 1, sizeof(cl_uint), &width); CHK_ERR(err);
+ err = clSetKernelArg(kernel, 2, sizeof(cl_uint), &mulFactor); CHK_ERR(err);
+ err = clSetKernelArg(kernel, 3, sizeof(cl_mem), &resultBuf); CHK_ERR(err);
+
+ err = clEnqueueNDRangeKernel(queue, kernel, 2, NULL, globalThreads, localThreads, 0, NULL, NULL);
+#if 0
+ err = clEnqueueReadBuffer(queue, resultBuf, CL_TRUE, 0, width * height * mulFactor * sizeof(cl_float4),
+ deviceResult, 1, &eND, NULL);
+#endif
+ deviceResult = clIntelMapBuffer(resultBuf, &err);
+ CHK_ERR(err);
+ verify();
+ clIntelUnmapBuffer(resultBuf);
+
+ return err;
+}
+
+void verify()
+{
+ int i;
+ // comparef(d, c, MAX, 1.0e-6);
+ /* check mean value of generated random numbers */
+ float meanVal = 0.0f;
+
+ for(i = 0; i < height * width * (int)mulFactor * 4; ++i) {
+ meanVal += deviceResult[i];
+ }
+
+ meanVal = fabs(meanVal) / (height * width * (int)mulFactor * 4);
+ printf("Mean Value of random numbers: %12.8f\n", meanVal);
+ printf("%s\n", (meanVal < 0.1f) ? "Passed" : "Failed");
+
+ printf("Done\n");
+}
+
--- /dev/null
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia@intel.com>
+ */
+
+#include "common.h"
+void verify();
+int width = 64;
+int height = 64;
+int blockSizeX = 256;
+int blockSizeY = 1;
+
+
+cl_float *sigma; /**< Array of sigma values */
+cl_float *price; /**< Array of price values */
+cl_float *vega; /**< Array of vega values */
+cl_float *refPrice; /**< Array of reference price values */
+cl_float *refVega; /**< Array of reference vega values */
+cl_uint *randNum; /**< Array of random numbers */
+cl_float *priceVals; /**< Array of price values for given samples */
+cl_float *priceDeriv; /**< Array of price derivative values for given samples */
+cl_int steps = 10;
+
+ typedef struct _MonteCalroAttrib
+ {
+ cl_float4 strikePrice;
+ cl_float4 c1;
+ cl_float4 c2;
+ cl_float4 c3;
+ cl_float4 initPrice;
+ cl_float4 sigma;
+ cl_float4 timeStep;
+ }MonteCarloAttrib;
+
+ float maturity = 1.f;
+ int noOfSum = 12;
+ int noOfTraj = 1024;
+ float initPrice = 50.f;
+ float strikePrice = 55.f;
+ float interest = 0.06f;
+
+int main(int argc, char**argv)
+{
+ struct args args = {0};
+ int err, i;
+ const cl_float finalValue = 0.8f;
+ const cl_float stepValue = finalValue / (cl_float)steps;
+
+ parseArgs(argc, argv, &args);
+
+ cl_device_id device = getDeviceID(args.d);
+ cl_context context = clCreateContext(0, 1, &device, NULL, NULL, &err); CHK_ERR(err);
+ cl_command_queue queue = clCreateCommandQueue(context, device, 0, &err); CHK_ERR(err);
+ cl_kernel kernel = getKernel(device, context, "monteCarloAsian_kernels.cl", "calPriceVega");
+
+ sigma = (cl_float*) newBuffer(steps * sizeof(cl_float), 0);
+ sigma[0] = 0.01f;
+ for (i = 1; i < steps; i++) {
+ sigma[i] = sigma[i - 1] + stepValue;
+ }
+
+ price = (cl_float*) newBuffer(steps * sizeof(cl_float), 0);
+ vega = (cl_float*) newBuffer(steps * sizeof(cl_float), 0);
+ refPrice = (cl_float*) newBuffer(steps * sizeof(cl_float), 0);
+ refVega = (cl_float*) newBuffer(steps * sizeof(cl_float), 0);
+
+ /* Set samples and exercize points */
+
+ width = noOfTraj / 4;
+ height = noOfTraj / 2;
+
+ randNum = (cl_uint*) newBuffer(width * height * sizeof(cl_uint4), 0);
+ priceVals = (cl_float*) newBuffer(width * height * 2 * sizeof(cl_float4), 0);
+ priceDeriv = (cl_float*) newBuffer(width * height * 2 * sizeof(cl_float4), 0);
+
+ cl_mem randBuf = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, sizeof(cl_uint4) * width * height, randNum, &err);
+ CHK_ERR(err);
+
+ cl_mem priceBuf = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(cl_float4) * width * height * 2, NULL, &err);
+ CHK_ERR(err);
+
+ cl_mem priceDerivBuf = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(cl_float4) * width * height * 2, NULL, &err);
+ CHK_ERR(err);
+
+ cl_event eND;
+ size_t globalThreads[2] = {width, height};
+ size_t localThreads[2] = {blockSizeX, blockSizeY};
+
+ MonteCarloAttrib attributes;
+
+ err = clSetKernelArg(kernel, 2, sizeof(cl_uint), &width); CHK_ERR(err);
+ err = clSetKernelArg(kernel, 3, sizeof(cl_mem), &randBuf); CHK_ERR(err);
+ err = clSetKernelArg(kernel, 4, sizeof(cl_mem), &priceBuf); CHK_ERR(err);
+ err = clSetKernelArg(kernel, 5, sizeof(cl_mem), &priceDerivBuf); CHK_ERR(err);
+ err = clSetKernelArg(kernel, 1, sizeof(cl_int), &noOfSum); CHK_ERR(err);
+
+ float timeStep = maturity / (noOfSum - 1);
+ // Initialize random number generator
+ srand(1);
+
+ int k, j;
+ for(k = 0; k < steps; k++) {
+ for(j = 0; j < (width * height * 4); j++) {
+ randNum[j] = (cl_uint)rand();
+ }
+
+ float c1 = (interest - 0.5f * sigma[k] * sigma[k]) * timeStep;
+ float c2 = sigma[k] * sqrt(timeStep);
+ float c3 = (interest + 0.5f * sigma[k] * sigma[k]);
+
+ const cl_float4 c1F4 = {c1, c1, c1, c1};
+ attributes.c1 = c1F4;
+
+ const cl_float4 c2F4 = {c2, c2, c2, c2};
+ attributes.c2 = c2F4;
+
+ const cl_float4 c3F4 = {c3, c3, c3, c3};
+ attributes.c3 = c3F4;
+
+ cl_float4 initPriceF4 = {initPrice, initPrice, initPrice, initPrice};
+ attributes.initPrice = initPriceF4;
+
+ const cl_float4 strikePriceF4 = {strikePrice, strikePrice, strikePrice, strikePrice};
+ attributes.strikePrice = strikePriceF4;
+
+ const cl_float4 sigmaF4 = {sigma[k], sigma[k], sigma[k], sigma[k]};
+ attributes.sigma = sigmaF4;
+
+ const cl_float4 timeStepF4 = {timeStep, timeStep, timeStep, timeStep};
+ attributes.timeStep = timeStepF4;
+
+
+ /* Set appropriate arguments to the kernel */
+ /* the input array - also acts as output for this pass (input for next) */
+ err = clSetKernelArg(kernel, 0, sizeof(attributes), &attributes);
+ CHK_ERR(err);
+
+ /*
+ * Enqueue a kernel run call.
+ */
+ err = clEnqueueNDRangeKernel(queue, kernel, 2, NULL, globalThreads, localThreads,
+ 0, NULL, &eND);
+ CHK_ERR(err);
+
+ /* Enqueue the results to application pointer*/
+ err = clEnqueueReadBuffer(queue, priceBuf, CL_TRUE, 0, width * height * 2 * sizeof(cl_float4), priceVals,
+ 1, &eND, NULL);
+ CHK_ERR(err);
+
+ /* Enqueue the results to application pointer*/
+ err = clEnqueueReadBuffer(queue, priceDerivBuf, CL_TRUE, 0, width * height * 2 * sizeof(cl_float4), priceDeriv,
+ 0, NULL, NULL);
+ CHK_ERR(err);
+
+ /* Replace Following "for" loop with reduction kernel */
+ for (i = 0; i < noOfTraj * noOfTraj; i++) {
+ price[k] += priceVals[i];
+ vega[k] += priceDeriv[i];
+ }
+
+ price[k] /= (noOfTraj * noOfTraj);
+ vega[k] /= (noOfTraj * noOfTraj);
+
+ price[k] = exp(-interest * maturity) * price[k];
+ vega[k] = exp(-interest * maturity) * vega[k];
+ }
+ verify();
+}
+
+void
+lshift128(unsigned int* input,
+ unsigned int shift,
+ unsigned int * output)
+{
+ unsigned int invshift = 32u - shift;
+
+ output[0] = input[0] << shift;
+ output[1] = (input[1] << shift) | (input[0] >> invshift);
+ output[2] = (input[2] << shift) | (input[1] >> invshift);
+ output[3] = (input[3] << shift) | (input[2] >> invshift);
+}
+
+void
+rshift128(unsigned int* input,
+ unsigned int shift,
+ unsigned int* output)
+{
+ unsigned int invshift = 32u - shift;
+ output[3]= input[3] >> shift;
+ output[2] = (input[2] >> shift) | (input[0] >> invshift);
+ output[1] = (input[1] >> shift) | (input[1] >> invshift);
+ output[0] = (input[0] >> shift) | (input[2] >> invshift);
+}
+
+void
+generateRand(unsigned int* seed,
+ float *gaussianRand1,
+ float *gaussianRand2,
+ unsigned int* nextRand)
+{
+
+ unsigned int mulFactor = 4;
+ unsigned int temp[8][4];
+
+ unsigned int state1[4] = {seed[0], seed[1], seed[2], seed[3]};
+ unsigned int state2[4] = {0u, 0u, 0u, 0u};
+ unsigned int state3[4] = {0u, 0u, 0u, 0u};
+ unsigned int state4[4] = {0u, 0u, 0u, 0u};
+ unsigned int state5[4] = {0u, 0u, 0u, 0u};
+
+ unsigned int stateMask = 1812433253u;
+ unsigned int thirty = 30u;
+ unsigned int mask4[4] = {stateMask, stateMask, stateMask, stateMask};
+ unsigned int thirty4[4] = {thirty, thirty, thirty, thirty};
+ unsigned int one4[4] = {1u, 1u, 1u, 1u};
+ unsigned int two4[4] = {2u, 2u, 2u, 2u};
+ unsigned int three4[4] = {3u, 3u, 3u, 3u};
+ unsigned int four4[4] = {4u, 4u, 4u, 4u};
+
+ unsigned int r1[4] = {0u, 0u, 0u, 0u};
+ unsigned int r2[4] = {0u, 0u, 0u, 0u};
+
+ unsigned int a[4] = {0u, 0u, 0u, 0u};
+ unsigned int b[4] = {0u, 0u, 0u, 0u};
+
+ unsigned int e[4] = {0u, 0u, 0u, 0u};
+ unsigned int f[4] = {0u, 0u, 0u, 0u};
+
+ unsigned int thirteen = 13u;
+ unsigned int fifteen = 15u;
+ unsigned int shift = 8u * 3u;
+
+ unsigned int mask11 = 0xfdff37ffu;
+ unsigned int mask12 = 0xef7f3f7du;
+ unsigned int mask13 = 0xff777b7du;
+ unsigned int mask14 = 0x7ff7fb2fu;
+
+ const float one = 1.0f;
+ const float intMax = 4294967296.0f;
+ const float PI = 3.14159265358979f;
+ const float two = 2.0f;
+
+ float r[4] = {0.0f, 0.0f, 0.0f, 0.0f};
+ float phi[4] = {0.0f, 0.0f, 0.0f, 0.0f};
+
+ float temp1[4] = {0.0f, 0.0f, 0.0f, 0.0f};
+ float temp2[4] = {0.0f, 0.0f, 0.0f, 0.0f};
+
+ int c;
+
+ //Initializing states.
+ for(c = 0; c < 4; ++c) {
+ state2[c] = mask4[c] * (state1[c] ^ (state1[c] >> thirty4[c])) + one4[c];
+ state3[c] = mask4[c] * (state2[c] ^ (state2[c] >> thirty4[c])) + two4[c];
+ state4[c] = mask4[c] * (state3[c] ^ (state3[c] >> thirty4[c])) + three4[c];
+ state5[c] = mask4[c] * (state4[c] ^ (state4[c] >> thirty4[c])) + four4[c];
+ }
+
+ unsigned int i = 0;
+ for(i = 0; i < mulFactor; ++i) {
+ switch(i)
+ {
+ case 0:
+ for(c = 0; c < 4; ++c)
+ {
+ r1[c] = state4[c];
+ r2[c] = state5[c];
+ a[c] = state1[c];
+ b[c] = state3[c];
+ }
+ break;
+ case 1:
+ for(c = 0; c < 4; ++c)
+ {
+ r1[c] = r2[c];
+ r2[c] = temp[0][c];
+ a[c] = state2[c];
+ b[c] = state4[c];
+ }
+ break;
+ case 2:
+ for(c = 0; c < 4; ++c)
+ {
+ r1[c] = r2[c];
+ r2[c] = temp[1][c];
+ a[c] = state3[c];
+ b[c] = state5[c];
+ }
+ break;
+ case 3:
+ for(c = 0; c < 4; ++c)
+ {
+ r1[c] = r2[c];
+ r2[c] = temp[2][c];
+ a[c] = state4[c];
+ b[c] = state1[c];
+ }
+ break;
+ default:
+ break;
+
+ }
+
+ lshift128(a, shift, e);
+ rshift128(r1, shift, f);
+
+ temp[i][0] = a[0] ^ e[0] ^ ((b[0] >> thirteen) & mask11) ^ f[0] ^ (r2[0] << fifteen);
+ temp[i][1] = a[1] ^ e[1] ^ ((b[1] >> thirteen) & mask12) ^ f[1] ^ (r2[1] << fifteen);
+ temp[i][2] = a[2] ^ e[2] ^ ((b[2] >> thirteen) & mask13) ^ f[2] ^ (r2[2] << fifteen);
+ temp[i][3] = a[3] ^ e[3] ^ ((b[3] >> thirteen) & mask14) ^ f[3] ^ (r2[3] << fifteen);
+
+ }
+
+ for(c = 0; c < 4; ++c) {
+ temp1[c] = temp[0][c] * one / intMax;
+ temp2[c] = temp[1][c] * one / intMax;
+ }
+
+ for(c = 0; c < 4; ++c) {
+ // Applying Box Mullar Transformations.
+ r[c] = sqrt((-two) * log(temp1[c]));
+ phi[c] = two * PI * temp2[c];
+ gaussianRand1[c] = r[c] * cos(phi[c]);
+ gaussianRand2[c] = r[c] * sin(phi[c]);
+
+ nextRand[c] = temp[2][c];
+ }
+}
+
+void
+calOutputs(float strikePrice,
+ float* meanDeriv1,
+ float* meanDeriv2,
+ float* meanPrice1,
+ float* meanPrice2,
+ float* pathDeriv1,
+ float* pathDeriv2,
+ float* priceVec1,
+ float* priceVec2)
+{
+ float temp1[4] = {0.0f, 0.0f, 0.0f, 0.0f};
+ float temp2[4] = {0.0f, 0.0f, 0.0f, 0.0f};
+ float temp3[4] = {0.0f, 0.0f, 0.0f, 0.0f};
+ float temp4[4] = {0.0f, 0.0f, 0.0f, 0.0f};
+
+ float tempDiff1[4] = {0.0f, 0.0f, 0.0f, 0.0f};
+ float tempDiff2[4] = {0.0f, 0.0f, 0.0f, 0.0f};
+
+ int c;
+
+ for(c = 0; c < 4; ++c)
+ {
+ tempDiff1[c] = meanPrice1[c] - strikePrice;
+ tempDiff2[c] = meanPrice2[c] - strikePrice;
+ }
+ if(tempDiff1[0] > 0.0f)
+ {
+ temp1[0] = 1.0f;
+ temp3[0] = tempDiff1[0];
+ }
+ if(tempDiff1[1] > 0.0f)
+ {
+ temp1[1] = 1.0f;
+ temp3[1] = tempDiff1[1];
+ }
+ if(tempDiff1[2] > 0.0f)
+ {
+ temp1[2] = 1.0f;
+ temp3[2] = tempDiff1[2];
+ }
+ if(tempDiff1[3] > 0.0f)
+ {
+ temp1[3] = 1.0f;
+ temp3[3] = tempDiff1[3];
+ }
+
+ if(tempDiff2[0] > 0.0f)
+ {
+ temp2[0] = 1.0f;
+ temp4[0] = tempDiff2[0];
+ }
+ if(tempDiff2[1] > 0.0f)
+ {
+ temp2[1] = 1.0f;
+ temp4[1] = tempDiff2[1];
+ }
+ if(tempDiff2[2] > 0.0f)
+ {
+ temp2[2] = 1.0f;
+ temp4[2] = tempDiff2[2];
+ }
+ if(tempDiff2[3] > 0.0f)
+ {
+ temp2[3] = 1.0f;
+ temp4[3] = tempDiff2[3];
+ }
+
+ for(c = 0; c < 4; ++c) {
+ pathDeriv1[c] = meanDeriv1[c] * temp1[c];
+ pathDeriv2[c] = meanDeriv2[c] * temp2[c];
+ priceVec1[c] = temp3[c];
+ priceVec2[c] = temp4[c];
+ }
+}
+
+void cpuRef()
+{
+
+ float timeStep = maturity / (noOfSum - 1);
+
+ // Initialize random number generator
+ srand(1);
+ int i, j, k, c;
+
+ for(k = 0; k < steps; k++) {
+ float c1 = (interest - 0.5f * sigma[k] * sigma[k]) * timeStep;
+ float c2 = sigma[k] * sqrt(timeStep);
+ float c3 = (interest + 0.5f * sigma[k] * sigma[k]);
+
+ for(j = 0; j < (width * height); j++) {
+ unsigned int nextRand[4] = {0u, 0u, 0u, 0u};
+ for (c = 0; c < 4; ++c)
+ nextRand[c] = (cl_uint)rand();
+
+ float trajPrice1[4] = {initPrice, initPrice, initPrice, initPrice};
+ float sumPrice1[4] = {initPrice, initPrice, initPrice, initPrice};
+ float sumDeriv1[4] = {0.0f, 0.0f, 0.0f, 0.0f};
+ float meanPrice1[4] = {0.0f, 0.0f, 0.0f, 0.0f};
+ float meanDeriv1[4] = {0.0f, 0.0f, 0.0f, 0.0f};
+ float price1[4] = {0.0f, 0.0f, 0.0f, 0.0f};
+ float pathDeriv1[4] = {0.0f, 0.0f, 0.0f, 0.0f};
+
+ float trajPrice2[4] = {initPrice, initPrice, initPrice, initPrice};
+ float sumPrice2[4] = {initPrice, initPrice, initPrice, initPrice};
+ float sumDeriv2[4] = {0.0f, 0.0f, 0.0f, 0.0f};
+ float meanPrice2[4] = {0.0f, 0.0f, 0.0f, 0.0f};
+ float meanDeriv2[4] = {0.0f, 0.0f, 0.0f, 0.0f};
+ float price2[4] = {0.0f, 0.0f, 0.0f, 0.0f};
+ float pathDeriv2[4] = {0.0f, 0.0f, 0.0f, 0.0f};
+
+ //Run the Monte Carlo simulation a total of Num_Sum - 1 times
+ for(i = 1; i < noOfSum; i++) {
+ unsigned int tempRand[4] = {0u, 0u, 0u, 0u};
+ for(c = 0; c < 4; ++c)
+ tempRand[c] = nextRand[c];
+
+ float gaussian1[4] = {0.0f, 0.0f, 0.0f, 0.0f};
+ float gaussian2[4] = {0.0f, 0.0f, 0.0f, 0.0f};
+ generateRand(tempRand, gaussian1, gaussian2, nextRand);
+
+ //Calculate the trajectory price and sum price for all trajectories
+ for(c = 0; c < 4; ++c) {
+ trajPrice1[c] = trajPrice1[c] * exp(c1 + c2 * gaussian1[c]);
+ trajPrice2[c] = trajPrice2[c] * exp(c1 + c2 * gaussian2[c]);
+
+ sumPrice1[c] = sumPrice1[c] + trajPrice1[c];
+ sumPrice2[c] = sumPrice2[c] + trajPrice2[c];
+
+ float temp = c3 * timeStep * i;
+
+ // Calculate the derivative price for all trajectories
+ sumDeriv1[c] = sumDeriv1[c] + trajPrice1[c]
+ * ((log(trajPrice1[c] / initPrice) - temp) / sigma[k]);
+
+ sumDeriv2[c] = sumDeriv2[c] + trajPrice2[c]
+ * ((log(trajPrice2[c] / initPrice) - temp) / sigma[k]);
+ }
+ }
+
+ //Calculate the average price and \93average derivative\94 of each simulated path
+ for(c = 0; c < 4; ++c) {
+ meanPrice1[c] = sumPrice1[c] / noOfSum;
+ meanPrice2[c] = sumPrice2[c] / noOfSum;
+ meanDeriv1[c] = sumDeriv1[c] / noOfSum;
+ meanDeriv2[c] = sumDeriv2[c] / noOfSum;
+ }
+
+ calOutputs(strikePrice, meanDeriv1, meanDeriv2, meanPrice1, meanPrice2,
+ pathDeriv1, pathDeriv2, price1, price2);
+
+ for(c = 0; c < 4; ++c) {
+ priceVals[j * 8 + c] = price1[c];
+ priceVals[j * 8 + 1 * 4 + c] = price2[c];
+ priceDeriv[j * 8 + c] = pathDeriv1[c];
+ priceDeriv[j * 8 + 1 * 4 + c] = pathDeriv2[c];
+ }
+ }
+
+ /* Replace Following "for" loop with reduction kernel */
+ for(i = 0; i < noOfTraj * noOfTraj; i++) {
+ refPrice[k] += priceVals[i];
+ refVega[k] += priceDeriv[i];
+ }
+
+ refPrice[k] /= (noOfTraj * noOfTraj);
+ refVega[k] /= (noOfTraj * noOfTraj);
+
+ refPrice[k] = exp(-interest * maturity) * refPrice[k];
+ refVega[k] = exp(-interest * maturity) * refVega[k];
+ }
+
+ /* compare the results and see if they match */
+ for(i = 0; i < steps; ++i)
+ {
+ if(fabs(price[i] - refPrice[i]) > 0.2f)
+ {
+ printf("Failed\n");
+ exit(-1);
+ }
+ if(fabs(vega[i] - refVega[i]) > 0.2f)
+ {
+ printf("Failed\n");
+ exit(-1);
+ }
+ }
+ printf("Passed\n");
+}
+
+void verify()
+{
+ int i;
+ // comparef(d, c, MAX, 1.0e-6);
+ cpuRef();
+ printf("Done\n");
+}
+
--- /dev/null
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia@intel.com>
+ */
+
+#include "common.h"
+#include "cl_test.h"
+
+void verify();
+
+float *initPos, *initVel, *pos, *vel;
+int numBodies = 1024;
+float delT = 0.005f;
+float espSqr = 50.0f;
+int GROUP_SIZE = 256;
+int ITERATIONS = 1;
+
+int main(int argc, char**argv)
+{
+ char *ker_path = NULL;
+ struct args args = {0};
+ int err, i;
+
+ parseArgs(argc, argv, &args);
+
+ cl_device_id device = getDeviceID(args.d);
+#if TEST_SIMD8
+ ker_path = do_kiss_path("nbody_kernels_0.bin8", device);
+#else
+ ker_path = do_kiss_path("nbody_kernels_0.bin", device);
+#endif
+ cl_context context = clCreateContext(0, 1, &device, NULL, NULL, &err); CHK_ERR(err);
+ cl_command_queue queue = clCreateCommandQueue(context, device, 0, &err); CHK_ERR(err);
+ cl_kernel kernel = getKernelFromBinary(device, context, ker_path, "nbody_sim");
+
+ initPos = newBuffer(numBodies * sizeof(cl_float4), 0);
+ initVel = newBuffer(numBodies * sizeof(cl_float4), 0);
+ pos = newBuffer(numBodies * sizeof(cl_float4), 0);
+ vel = newBuffer(numBodies * sizeof(cl_float4), 0);
+
+ /* initialization of inputs */
+ for (i = 0; i < numBodies; ++i) {
+ int index = 4 * i;
+ int j;
+
+ // First 3 values are position in x,y and z direction
+ for(j = 0; j < 3; ++j) {
+ initPos[index + j] = randf2(3.0f, 50.0f);
+ }
+
+ // Mass value
+ initPos[index + 3] = randf2(1.0, 1000.0);
+
+ // First 3 values are velocity in x,y and z direction
+ for (j = 0; j < 3; ++j) {
+ initVel[index + j] = 0.0f;
+ }
+
+ // unused
+ initVel[3] = 0.0f;
+ }
+
+ memcpy(pos, initPos, 4 * numBodies * sizeof(cl_float));
+ memcpy(vel, initVel, 4 * numBodies * sizeof(cl_float));
+
+ cl_mem curPosBuf = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR, numBodies * sizeof(cl_float4), pos, &err); CHK_ERR(err);
+ cl_mem newPosBuf = clCreateBuffer(context, CL_MEM_READ_WRITE, numBodies * sizeof(cl_float4), 0, &err); CHK_ERR(err);
+ cl_mem curVelBuf = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR, numBodies * sizeof(cl_float4), vel, &err); CHK_ERR(err);
+ cl_mem newVelBuf = clCreateBuffer(context, CL_MEM_READ_WRITE, numBodies * sizeof(cl_float4), 0, &err); CHK_ERR(err);
+
+ /* Execute */
+ err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &curPosBuf);
+ err = clSetKernelArg(kernel, 1, sizeof(cl_mem), &curVelBuf);
+ err = clSetKernelArg(kernel, 2, sizeof(cl_int), &numBodies);
+ err = clSetKernelArg(kernel, 3, sizeof(cl_float), &delT);
+ err = clSetKernelArg(kernel, 4, sizeof(cl_float), &espSqr);
+ err = clSetKernelArg(kernel, 5, GROUP_SIZE * 4 * sizeof(float), NULL);
+ err = clSetKernelArg(kernel, 6, sizeof(cl_mem), &newPosBuf);
+ err = clSetKernelArg(kernel, 7, sizeof(cl_mem), &newVelBuf);
+
+ size_t globalThreads[] = {numBodies};
+ size_t localThreads[] = {GROUP_SIZE};
+
+ assert(ITERATIONS == 1);
+ for (i = 0; i < ITERATIONS; i++) {
+
+ err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, globalThreads, localThreads, 0, NULL, NULL); CHK_ERR(err);
+
+ /* Copy data from new to old */
+ // err = clEnqueueCopyBuffer(queue, newPosBuf, curPosBuf, 0, 0, sizeof(cl_float4) * numBodies, 0, 0, 0); CHK_ERR(err);
+ // err = clEnqueueCopyBuffer(queue, newVelBuf, curVelBuf, 0, 0, sizeof(cl_float4) * numBodies, 0, 0, 0); CHK_ERR(err);
+ // err = clFinish(queue); CHK_ERR(err);
+
+ /* Enqueue readBuffer*/
+ /* Wait for the read buffer to finish execution */
+ // err = clEnqueueReadBuffer(queue, curPosBuf, CL_TRUE, 0, numBodies* sizeof(cl_float4), pos, 0, NULL, NULL); CHK_ERR(err);
+ }
+
+ pos = clIntelMapBuffer(newPosBuf, &err);
+ verify();
+ return 0;
+}
+
+void
+nBodyCPU(float* refPos, float* refVel)
+{
+ int i, j, k;
+
+ //Iterate for all samples
+ for(i = 0; i < numBodies; ++i) {
+ int myIndex = 4 * i;
+ float acc[3] = {0.0f, 0.0f, 0.0f};
+
+ for(j = 0; j < numBodies; ++j) {
+ float r[3];
+ int index = 4 * j;
+
+ float distSqr = 0.0f;
+ for(k = 0; k < 3; ++k) {
+ r[k] = refPos[index + k] - refPos[myIndex + k];
+ distSqr += r[k] * r[k];
+ }
+
+ float invDist = 1.0f / sqrt(distSqr + espSqr);
+ float invDistCube = invDist * invDist * invDist;
+ float s = refPos[index + 3] * invDistCube;
+
+ for(k = 0; k < 3; ++k) {
+ acc[k] += s * r[k];
+ }
+ }
+
+ for(k = 0; k < 3; ++k) {
+ refPos[myIndex + k] += refVel[myIndex + k] * delT + 0.5f * acc[k] * delT * delT;
+ refVel[myIndex + k] += acc[k] * delT;
+ }
+ }
+}
+
+void verify()
+{
+ int i;
+
+ float* refPos = newBuffer(numBodies * sizeof(cl_float4), 0);
+ float* refVel = newBuffer(numBodies * sizeof(cl_float4), 0);
+ memcpy(refPos, initPos, 4 * numBodies * sizeof(cl_float));
+ memcpy(refVel, initVel, 4 * numBodies * sizeof(cl_float));
+
+ for (i = 0; i < ITERATIONS; i++) {
+ nBodyCPU(refPos, refVel);
+ }
+
+ comparef(refPos, pos, numBodies * 4, 1.0e-4f);
+ for (i = 0; i < 20; i++) {
+ printf("%12.8f | %12.8f\n", refPos[i], pos[i]);
+ }
+ printf("Done\n");
+}
+
--- /dev/null
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia@intel.com>
+ */
+
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <CL/cl.h>
+#include <assert.h>
+
+#ifndef _WIN32
+#include "CL/cl_intel.h"
+#include "common.h"
+#endif
+
+#define USE_CL_SOURCE
+
+#ifdef USE_CL_SOURCE
+const char *progsrc =
+"__kernel void test1( \n" \
+" __global char* svm, \n" \
+" uint svmBase, \n" \
+" uint context) \n" \
+"{ \n" \
+" int i = get_global_id(0); \n" \
+" //__global int* ptr = (__global int*)(svm+context-svmBase); \n" \
+" //__global int* ptr = (__global int*)&svm[context-svmBase]; \n" \
+" svm -= svmBase; \n" \
+" __global int *ptr = (__global int *)&svm[context]; \n" \
+" ptr[i]=i; \n" \
+"} \n" \
+"\n";
+#else
+static const char* progsrc =
+" SHADER test1 \n"
+" VERSION_2_1 \n"
+" DCL_THREADGROUP VARIABLE; \n"
+" DCL_UAVRAW u0, TRUE = \n"
+" { \n"
+" KERNEL_ARGUMENT,0 \n"
+" }; \n"
+" DCL_CONSTANTREGISTER c0 = \n"
+" { \n"
+" KERNEL_ARGUMENT, 1, 0, \n"
+" KERNEL_ARGUMENT, 2, 0, \n"
+" UNUSED, 0, 0, \n"
+" UNUSED, 0, 0 \n"
+" }; \n"
+" DCL_INPUT i0.xyz, THREAD_ID; \n"
+" DCL_POINTER ptr1; \n"
+" DCL_TEMP r0; \n"
+" DCL_TEMP r1; \n"
+" DCL_POINTER ptr0; \n"
+" ADDRESS_OF ptr0, u0; \n"
+" DCL_TEMP r2; \n"
+" DCL_TEMP r3; \n"
+" DCL_TEMP r4; \n"
+" DCL_POINTER ptr2; \n"
+" DCL_TEMP r5; \n"
+" MOV r2.x, i0.x; \n"
+" PADD ptr1, ptr0,-c0.x; \n"
+" PADD ptr1, ptr1,c0.y;\n"
+" MOV r0.x, r2.x; \n"
+" MOV r5.x, r0.x; \n"
+" MOV r1.x, r5.x; \n"
+" SHL r3.x, r1.x, 2; \n"
+//" MOV r4.x, 0; \n"
+" PADD ptr2, ptr1, r3.x; \n"
+//" UADD r4.x, r4.x, 2; \n"
+//" STORE_RAW_PTR ptr2.x, r4.x; \n"
+" STORE_RAW_PTR ptr2.x, r2.x; \n"
+" RET; \n"
+" END \n"
+;
+#endif
+
+const char *src;
+const char *kernel_name;
+
+#define PAGE_SIZE (4 << 10)
+#define PAGE_ALIGNMENT_MASK (~(PAGE_SIZE-1))
+#define SVM_SIZE (128 << 20)
+
+cl_device_id device_id; // device ID
+cl_context context; // context
+cl_command_queue queue; // command queue
+char* svmBase = NULL;
+cl_mem svmMemObject;
+
+char *load_program_source(const char *filename)
+{
+ FILE *fh;
+ struct stat statbuf;
+ char *source;
+
+ if (!(fh=fopen(filename,"r")))
+ return NULL;
+
+ stat(filename, &statbuf);
+ source = (char *)malloc(statbuf.st_size+1);
+ fread(source, statbuf.st_size,1,fh);
+ source[statbuf.st_size]=0;
+ return source;
+}
+
+int init() {
+ cl_uint num_devices;
+ char device_name[1024];
+ cl_int err;
+ cl_platform_id platform;
+
+ err = clGetPlatformIDs(1, &platform, NULL);
+ if (err != CL_SUCCESS) {
+ fprintf(stderr, "Error: clGetPlatformIDs failed. Error code = %d.\n", err);
+ return err;
+ }
+
+ // Get an ID for the device
+ err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1,
+ &device_id, &num_devices);
+ if (err != CL_SUCCESS)
+ {
+ printf("Error: clGetDeviceIDs");
+ return err;
+ }
+ printf("num_gpu_devices: %d\n",num_devices);
+ err = clGetDeviceInfo(device_id,
+ CL_DEVICE_NAME,
+ 1024,
+ device_name,
+ NULL);
+ if (err != CL_SUCCESS) {
+ printf("Error: clGetDeviceInfo %d\n",err);
+ return err;
+ }
+
+ printf("device name is %s\n",device_name);
+ context = clCreateContext(0, 1, &device_id, NULL, NULL, &err);
+ if (!context)
+ {
+ printf("Error: clCreateContext. error code=%d\n",err);
+ return err;
+ }
+
+ // Create a command queue [5]
+ //
+ queue = clCreateCommandQueue(context, device_id, 0, &err);
+ if (!queue)
+ {
+ printf("Error:clCreateCommandQueue %d",err);
+ return err;
+ }
+#ifdef _WIN32
+ svmBase = malloc(SVM_SIZE+PAGE_SIZE);
+ (cl_uint)svmBase &= PAGE_ALIGNMENT_MASK;
+ svmMemObject = clCreateBuffer(context, CL_MEM_READ_WRITE |
+ CL_MEM_USE_HOST_PTR,
+ (size_t)SVM_SIZE,
+ svmBase,
+ &err);
+ if (err != CL_SUCCESS) {
+ printf("Error: clCreateBuffer of SVM: %d\n",err);
+ return err;
+ }
+#else // Linux
+ svmMemObject = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_PINNABLE,
+ SVM_SIZE, NULL, &err);
+ if (err != CL_SUCCESS) {
+ printf("Error: clCreateBuffer of SVM: %d\n",err);
+ return err;
+ }
+ if ((err = clIntelPinBuffer(svmMemObject)) != CL_SUCCESS) {
+ printf("Error: clIntelPinBuffer: %d\n",err);
+ return err;
+ }
+ svmBase = clIntelMapBuffer(svmMemObject,&err);
+ if (err != CL_SUCCESS) {
+ printf("Error: clIntelMapBuffer: %d\n",err);
+ return err;
+ }
+#endif
+
+ printf("initializing svm with zeros\n");
+ memset(svmBase,0,SVM_SIZE);
+ return err;
+}
+
+int test2() {
+ size_t global; // global domain size for our calculation
+ size_t local; // local domain size for our calculation
+ int err; // error code returned from api calls
+ cl_kernel kernel;
+#ifndef _WIN32
+ /*cl_event event;*/
+#endif
+ int i;
+ char *start;
+ size_t offset;
+ int failed;
+
+#ifdef _WIN32
+ // Create the compute program from the source buffer [6]
+ //
+ cl_program program = clCreateProgramWithSource(context, 1,
+ (const char **) & src, NULL, &err);
+ if (!program)
+ {
+ printf("Error:clCreateProgramWithSource");
+ return 0;
+ }
+ // Build the program executable [7]
+ //
+ err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
+ if (err != CL_SUCCESS)
+ {
+ size_t len;
+ char buffer[2048];
+ //printf("Error: Failed to build program executable\n"); [8]
+ clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG,
+ sizeof(buffer), buffer, &len);
+ printf("%s\n", buffer);
+ exit(1);
+ }
+#else
+ char *ker_path = NULL;
+#endif
+
+ // Create the compute kernel in the program we wish to run [9]
+ //
+#ifdef _WIN32
+ kernel = clCreateKernel(program, kernel_name, &err);
+ if (!kernel || err != CL_SUCCESS)
+ {
+ printf("Error: clCreateKernel");
+ return 0;
+ }
+#else
+ ker_path = do_kiss_path("svm_test_kernel_0.bin", device_id);
+ kernel = getKernelFromBinary(device_id, context, ker_path, "test1");
+#endif
+ // Set the arguments to our compute kernel [12]
+ //
+ //offset = SVM_SIZE >> 1;
+ offset = 0;
+ //offset = 16;
+ //offset = 1;
+ //global = (64 << 20) / sizeof(int);
+ //global = SVM_SIZE / sizeof(int);
+ //global = (SVM_SIZE >> 1) / sizeof(int);
+ global = 1<<10;
+ start = svmBase + offset;
+ err = 0;
+ err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &svmMemObject);
+ err |= clSetKernelArg(kernel, 1, sizeof(cl_uint), &svmBase);
+ err |= clSetKernelArg(kernel, 2, sizeof(cl_uint), &start);
+ if (err != CL_SUCCESS)
+ {
+ printf("Error:clSetKernelArg %d",err);
+ return 0;
+ }
+
+#if 0
+ // Get the maximum work-group size for executing the kernel on the device
+ // [13]
+ err = clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE,
+ sizeof(size_t), &local, NULL);
+ if (err != CL_SUCCESS)
+ {
+ printf("Error:clGetKernelWorkGroupInfo: %d",err);
+ return 0;
+ }
+ printf("workgroup size is %d\n",(int)local);
+#endif
+ local = 16;
+ err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL,
+ &global, &local /*NULL*/,
+ 0, NULL, NULL /*&event*/);
+ if (err != CL_SUCCESS)
+ {
+ printf("Error:clEnqueueNDRangeKernel: %d",err);
+ return 0;
+ }
+
+ clFinish(queue);
+
+ failed = 0;
+ for (i=0;i<(int)global;i++) {
+ if (((int*)start)[i]!=i) {
+ printf("svmBase[%d]=%d.\n",(int) i+(int) offset,(int) start[i]);
+ failed = 1;
+ }
+ }
+
+ printf(failed?"Test failed\n":"Test passed.\n");
+
+ return 0;
+}
+
+
+int main(int argc, char** argv)
+{
+ //src = load_program_source(argv[1]);
+ src = progsrc;
+ //kernel_name = argv[2];
+ kernel_name = "test1";
+ if (init() != CL_SUCCESS)
+ return -1;
+ test2();
+ return 0;
+}
--- /dev/null
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia@intel.com>
+ */
+
+#include "cl_test.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+typedef union kernel_arg {
+ float f;
+ uint32_t u32;
+ uint16_t u16;
+ uint8_t u8;
+ int32_t i32;
+ int16_t i16;
+ int8_t i8;
+} kernel_arg_t;
+
+#define W 32
+#define H 32
+
+int
+main (int argc, char *argv[])
+{
+ cl_mem dst0, dst1, src;
+ uint32_t *src_buffer = NULL;
+ IF_DEBUG (uint32_t *dst0_buffer = NULL);
+ IF_DEBUG (uint32_t *dst1_buffer = NULL);
+ const size_t n = W*H;
+ const size_t global_work_size[2] = {W, H};
+ const size_t local_work_size[2] = {8,8};
+ kernel_arg_t arg;
+ int status = 0, i, j;
+
+#if TEST_SIMD8
+ if ((status = cl_test_init("test_2d_copy_kernels_0.bin8", "test_2d_copy")) != 0)
+ goto error;
+#else
+ if ((status = cl_test_init("test_2d_copy_kernels_0.bin", "test_2d_copy")) != 0)
+ goto error;
+#endif
+
+ /* Fill the buffer with random values */
+ if ((src_buffer = malloc(sizeof(uint32_t) * n)) == NULL) {
+ fprintf(stderr, "Allocation failed\n");
+ status = CL_OUT_OF_HOST_MEMORY;
+ goto error;
+ }
+ for (i = 0; i < n; ++i)
+ src_buffer[i] = rand();
+
+ /* Allocate the two buffers */
+ dst0 = clCreateBuffer(ctx, 0, n * sizeof(uint32_t), NULL, &status);
+ if (status != CL_SUCCESS)
+ goto error;
+ dst1 = clCreateBuffer(ctx, 0, n * sizeof(uint32_t), NULL, &status);
+ if (status != CL_SUCCESS)
+ goto error;
+ src = clCreateBuffer(ctx, CL_MEM_COPY_HOST_PTR, n * sizeof(uint32_t), src_buffer, &status);
+ if (status != CL_SUCCESS)
+ goto error;
+ free(src_buffer);
+
+ /* Set source and destination */
+ arg.i32 = W;
+ CALL (clSetKernelArg, kernel, 0, sizeof(cl_mem), &dst0);
+ CALL (clSetKernelArg, kernel, 1, sizeof(cl_mem), &dst1);
+ CALL (clSetKernelArg, kernel, 2, sizeof(cl_mem), &src);
+ CALL (clSetKernelArg, kernel, 3, sizeof(float), &arg);
+
+ /* Run the kernel */
+ CALL (clEnqueueNDRangeKernel, queue,
+ kernel,
+ 2,
+ NULL,
+ global_work_size,
+ local_work_size,
+ 0,
+ NULL,
+ NULL);
+
+ /* Be sure that everything run fine */
+#ifndef NDEBUG
+ dst0_buffer = (uint32_t *) clIntelMapBuffer(dst0, &status);
+ if (status != CL_SUCCESS)
+ goto error;
+ dst1_buffer = (uint32_t *) clIntelMapBuffer(dst1, &status);
+ if (status != CL_SUCCESS)
+ goto error;
+#endif /* NDEBUG */
+ src_buffer = (uint32_t *) clIntelMapBuffer(src, &status);
+ if (status != CL_SUCCESS)
+ goto error;
+
+ for (j = 0; j < H; ++j)
+ for (i = 0; i < W; ++i) {
+ assert(dst0_buffer[i+j*W] == src_buffer[i+j*W]);
+ assert(dst1_buffer[i+j*W] == i+j);
+ }
+ IF_DEBUG(CALL (clIntelUnmapBuffer, dst0));
+ IF_DEBUG(CALL (clIntelUnmapBuffer, dst1));
+ CALL (clIntelUnmapBuffer, src);
+ CALL (clReleaseMemObject, dst0);
+ CALL (clReleaseMemObject, dst1);
+ CALL (clReleaseMemObject, src);
+ cl_test_destroy();
+ printf("%i memory leaks\n", clIntelReportUnfreed());
+ assert(clIntelReportUnfreed() == 0);
+
+error:
+ cl_report_error(status);
+ return status;
+}
+
--- /dev/null
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia@intel.com>
+ */
+
+#include "cl_test.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+typedef union kernel_arg {
+ float f;
+ uint32_t u32;
+ uint16_t u16;
+ uint8_t u8;
+ int32_t i32;
+ int16_t i16;
+ int8_t i8;
+} kernel_arg_t;
+
+#define N 1024
+int
+main (int argc, char *argv[])
+{
+ cl_mem dst, src;
+ IF_DEBUG(uint32_t *dst_buffer = NULL);
+ uint32_t *src_buffer = NULL;
+ const size_t n = N;
+ const size_t global_work_size = N;
+ const size_t local_work_size = 256;
+ kernel_arg_t arg;
+ int status = 0, i;
+
+#if TEST_SIMD8
+ if ((status = cl_test_init("test_barrier_kernels_0.bin8", "test_barrier")) != 0)
+ goto error;
+#else
+ if ((status = cl_test_init("test_barrier_kernels_0.bin", "test_barrier")) != 0)
+ goto error;
+#endif
+
+ /* Fill the buffer with random values */
+ if ((src_buffer = malloc(sizeof(uint32_t) * n)) == NULL) {
+ fprintf(stderr, "Allocation failed\n");
+ status = CL_OUT_OF_HOST_MEMORY;
+ goto error;
+ }
+ for (i = 0; i < n; ++i)
+ src_buffer[i] = i;
+
+ /* Allocate the two buffers */
+ dst = clCreateBuffer(ctx, 0, n * sizeof(uint32_t), NULL, &status);
+ if (status != CL_SUCCESS)
+ goto error;
+ src = clCreateBuffer(ctx, CL_MEM_COPY_HOST_PTR, n * sizeof(uint32_t), src_buffer, &status);
+ if (status != CL_SUCCESS)
+ goto error;
+ free(src_buffer);
+
+ /* Set source and destination */
+ CALL (clSetKernelArg, kernel, 0, sizeof(cl_mem), &dst);
+ CALL (clSetKernelArg, kernel, 4, sizeof(cl_mem), &src);
+ arg.f = 1.f;
+ CALL (clSetKernelArg, kernel, 1, sizeof(float), &arg);
+ arg.i8 = 2;
+ CALL (clSetKernelArg, kernel, 2, sizeof(int8_t), &arg);
+ arg.i32 = 3;
+ CALL (clSetKernelArg, kernel, 3, sizeof(int32_t), &arg);
+ arg.i16 = 4;
+ CALL (clSetKernelArg, kernel, 5, sizeof(int16_t), &arg);
+ arg.u32 = 5;
+ CALL (clSetKernelArg, kernel, 6, sizeof(uint32_t), &arg);
+ arg.i32 = 6;
+ CALL (clSetKernelArg, kernel, 7, sizeof(int32_t), &arg);
+
+ /* Run the kernel */
+ CALL (clEnqueueNDRangeKernel, queue,
+ kernel,
+ 1,
+ NULL,
+ &global_work_size,
+ &local_work_size,
+ 0,
+ NULL,
+ NULL);
+
+ /* Be sure that everything run fine */
+#ifndef NDEBUG
+ dst_buffer = (uint32_t *) clIntelMapBuffer(dst, &status);
+ if (status != CL_SUCCESS)
+ goto error;
+#endif
+ src_buffer = (uint32_t *) clIntelMapBuffer(src, &status);
+ if (status != CL_SUCCESS)
+ goto error;
+ for (i = 0; i < N; ++i)
+ assert(dst_buffer[i] == src_buffer[i] + 21);
+
+ IF_DEBUG(CALL (clIntelUnmapBuffer, dst));
+ CALL (clIntelUnmapBuffer, src);
+ CALL (clReleaseMemObject, dst);
+ CALL (clReleaseMemObject, src);
+ cl_test_destroy();
+ printf("%i memory leaks\n", clIntelReportUnfreed());
+ assert(clIntelReportUnfreed() == 0);
+
+error:
+ cl_report_error(status);
+ return status;
+}
+
--- /dev/null
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia@intel.com>
+ */
+
+#include "cl_test.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#ifndef NDEBUG
+static const uint32_t wrk_value[32] = {
+ 71, 69, 67, 65, 63, 61, 59, 57, 55, 53, 51, 49, 47, 45, 43, 41, 39, 37, 35,
+ 33, 31, 29, 27, 25, 23, 21, 19, 17, 15, 13, 11, 9
+};
+#endif /* NDEBUG */
+
+#define N 512
+int
+main (int argc, char *argv[])
+{
+ cl_mem dst, constants;
+ IF_DEBUG(uint32_t *dst_buffer = NULL);
+ uint32_t *cst_buffer = NULL;
+ const size_t n = N;
+ const size_t global_work_size = N;
+ const size_t local_work_size = 32;
+ int status = 0, i;
+
+#if TEST_SIMD8
+ if ((status = cl_test_init("test_constant_memory_0.bin8", "test_constant_memory")) != 0)
+ goto error;
+#else
+ if ((status = cl_test_init("test_constant_memory_0.bin", "test_constant_memory")) != 0)
+ goto error;
+#endif
+
+ /* Allocate the dst buffer */
+ dst = clCreateBuffer(ctx, 0, n * sizeof(uint32_t), NULL, &status);
+ if (status != CL_SUCCESS)
+ goto error;
+
+ /* Allocate the constant buffer and fill it with ones */
+ if ((cst_buffer = (uint32_t*) malloc(sizeof(uint32_t) * 32)) == NULL) {
+ status = CL_OUT_OF_HOST_MEMORY;
+ goto error;
+ }
+ for (i = 0; i < 32; ++i)
+ cst_buffer[i] = 1;
+
+ constants = clCreateBuffer(ctx, CL_MEM_COPY_HOST_PTR,
+ 32 * sizeof(uint32_t),
+ cst_buffer,
+ &status);
+ if (status != CL_SUCCESS)
+ goto error;
+ free(cst_buffer);
+
+ /* Set source and destination */
+ CALL (clSetKernelArg, kernel, 0, sizeof(cl_mem), &dst);
+ CALL (clSetKernelArg, kernel, 1, 32*sizeof(int), NULL);
+ CALL (clSetKernelArg, kernel, 2, sizeof(cl_mem), &constants);
+
+ /* Run the kernel */
+ CALL (clEnqueueNDRangeKernel, queue,
+ kernel,
+ 1,
+ NULL,
+ &global_work_size,
+ &local_work_size,
+ 0,
+ NULL,
+ NULL);
+
+ /* Be sure that everything run fine */
+#ifndef NDEBUG
+ dst_buffer = (uint32_t *) clIntelMapBuffer(dst, &status);
+ if (status != CL_SUCCESS)
+ goto error;
+ for (i = 0; i < N; ++i)
+ assert(dst_buffer[i] == wrk_value[i % 32]);
+ CALL (clIntelUnmapBuffer, dst);
+#endif /* NDEBUG */
+
+ CALL (clReleaseMemObject, dst);
+ CALL (clReleaseMemObject, constants);
+ cl_test_destroy();
+ printf("%i memory leaks\n", clIntelReportUnfreed());
+ assert(clIntelReportUnfreed() == 0);
+
+error:
+ cl_report_error(status);
+ return status;
+}
+
--- /dev/null
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia@intel.com>
+ */
+
+#include "cl_test.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+int
+main (int argc, char *argv[])
+{
+ cl_mem dst, src;
+ IF_DEBUG(uint32_t *dst_buffer = NULL);
+ uint32_t *src_buffer = NULL;
+ const size_t n = 8192 * 4;
+ const size_t global_work_size = n;
+ const size_t local_work_size = 32;
+ int status = 0, i;
+
+ if ((status = cl_test_init("test_copy_buffer.cl", "test_copy_buffer", SOURCE)) != 0)
+ goto error;
+
+ /* Fill the buffer with random values */
+ if ((src_buffer = malloc(sizeof(uint32_t) * n)) == NULL) {
+ fprintf(stderr, "Allocation failed\n");
+ status = CL_OUT_OF_HOST_MEMORY;
+ goto error;
+ }
+ for (i = 0; i < n; ++i)
+ src_buffer[i] = i;
+
+ /* Allocate the two buffers */
+ dst = clCreateBuffer(ctx, 0, n * sizeof(uint32_t), NULL, &status);
+ if (status != CL_SUCCESS)
+ goto error;
+ src = clCreateBuffer(ctx, CL_MEM_COPY_HOST_PTR, n * sizeof(uint32_t), src_buffer, &status);
+ if (status != CL_SUCCESS)
+ goto error;
+
+ /* Set source and destination */
+ CALL (clSetKernelArg, kernel, 0, sizeof(cl_mem), &src);
+ CALL (clSetKernelArg, kernel, 1, sizeof(cl_mem), &dst);
+
+ /* Run the kernel */
+ CALL (clEnqueueNDRangeKernel, queue,
+ kernel,
+ 1,
+ NULL,
+ &global_work_size,
+ &local_work_size,
+ 0,
+ NULL,
+ NULL);
+
+#ifndef NDEBUG
+ /* Be sure that everything run fine */
+ dst_buffer = (uint32_t *) clIntelMapBuffer(dst, &status);
+ if (status != CL_SUCCESS)
+ goto error;
+ for (i = 0; i < n; ++i)
+ assert(src_buffer[i] == dst_buffer[i]);
+ CALL (clIntelUnmapBuffer, dst);
+#endif /* NDEBUG */
+ free(src_buffer);
+ CALL (clReleaseMemObject, dst);
+ CALL (clReleaseMemObject, src);
+ cl_test_destroy();
+ printf("%i memory leaks\n", clIntelReportUnfreed());
+ assert(clIntelReportUnfreed() == 0);
+
+error:
+ cl_report_error(status);
+ return status;
+}
+
--- /dev/null
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia@intel.com>
+ */
+
+#include "cl_test.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+int
+main (int argc, char *argv[])
+{
+ cl_mem dst, src, data;
+ IF_DEBUG(uint32_t *dst_buffer = NULL);
+ uint32_t *src_buffer = NULL;
+ int *data_buffer = NULL;
+ const int row = 8192;
+ const int row_n = 2;
+ const int n = row * row_n;
+ const size_t global_work_size = row;
+ const size_t local_work_size = 256;
+ int status = 0, i;
+
+ if ((status = cl_test_init("test_copy_buffer_row.cl", "test_copy_buffer_row", SOURCE)) != 0)
+ goto error;
+
+ /* Fill the buffer with some values */
+ src_buffer = (uint32_t *) malloc(sizeof(uint32_t) * n);
+ for (i = 0; i < n; ++i) src_buffer[i] = i;
+
+ /* Just put copy info in a buffer */
+ data_buffer = (int *) malloc(sizeof(int) * 2);
+ data_buffer[0] = row;
+ data_buffer[1] = n;
+
+ /* Allocate the two buffers */
+ dst = clCreateBuffer(ctx, 0, n * sizeof(uint32_t), NULL, &status);
+ if (status != CL_SUCCESS) goto error;
+ src = clCreateBuffer(ctx, CL_MEM_COPY_HOST_PTR, n * sizeof(uint32_t), src_buffer, &status);
+ if (status != CL_SUCCESS) goto error;
+ data = clCreateBuffer(ctx, CL_MEM_COPY_HOST_PTR, 2 * sizeof(int), data_buffer, &status);
+ if (status != CL_SUCCESS) goto error;
+
+ /* Set source and destination */
+ CALL (clSetKernelArg, kernel, 0, sizeof(cl_mem), &src);
+ CALL (clSetKernelArg, kernel, 1, sizeof(cl_mem), &dst);
+ CALL (clSetKernelArg, kernel, 2, sizeof(cl_mem), &data);
+
+ /* Run the kernel */
+ CALL (clEnqueueNDRangeKernel, queue,
+ kernel,
+ 1,
+ NULL,
+ &global_work_size,
+ &local_work_size,
+ 0,
+ NULL,
+ NULL);
+
+#ifndef NDEBUG
+ /* Be sure that everything run fine */
+ dst_buffer = (uint32_t *) clIntelMapBuffer(dst, &status);
+ if (status != CL_SUCCESS)
+ goto error;
+ for (i = 0; i < n; ++i)
+ assert(src_buffer[i] == dst_buffer[i]);
+ CALL (clIntelUnmapBuffer, dst);
+#endif /* NDEBUG */
+
+ free(src_buffer);
+ free(data_buffer);
+ CALL (clReleaseMemObject, dst);
+ CALL (clReleaseMemObject, src);
+ CALL (clReleaseMemObject, data);
+ cl_test_destroy();
+ printf("%i memory leaks\n", clIntelReportUnfreed());
+ assert(clIntelReportUnfreed() == 0);
+
+error:
+ cl_report_error(status);
+ return status;
+}
+
--- /dev/null
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia@intel.com>
+ */
+
+#include "cl_test.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+int
+main (int argc, char *argv[])
+{
+ cl_mem dst, src;
+ IF_DEBUG(uint32_t *dst_buffer = NULL);
+ uint32_t *src_buffer = NULL;
+ const size_t w = 64, h = 64;
+ const size_t n = w * h;
+ const size_t global_work_size[2] = {w,h};
+ const size_t local_work_size[2] = {16,1};
+ int status = 0, i;
+
+#if TEST_SIMD8
+ if ((status = cl_test_init("test_copy_image_0.bin8", "test_copy_image")) != 0)
+ goto error;
+#else
+ if ((status = cl_test_init("test_copy_image_0.bin", "test_copy_image")) != 0)
+ goto error;
+#endif
+
+ /* Fill the buffer with random values */
+ if ((src_buffer = malloc(sizeof(uint32_t) * n)) == NULL) {
+ fprintf(stderr, "Allocation failed\n");
+ status = CL_OUT_OF_HOST_MEMORY;
+ goto error;
+ }
+ for (i = 0; i < n; ++i)
+ src_buffer[i] = i;
+
+ /* Allocate the two buffers */
+ dst = clCreateBuffer(ctx, 0, n * sizeof(uint32_t), NULL, &status);
+ if (status != CL_SUCCESS)
+ goto error;
+ const cl_image_format fmt = {
+ .image_channel_order = CL_RGBA,
+ .image_channel_data_type = CL_UNSIGNED_INT8
+ };
+ src = clCreateImage2D(ctx,
+ CL_MEM_COPY_HOST_PTR,
+ &fmt,
+ w,
+ h,
+ w * sizeof(uint32_t),
+ src_buffer,
+ &status);
+ if (status != CL_SUCCESS)
+ goto error;
+
+ /* Set source and destination */
+ CALL (clSetKernelArg, kernel, 0, sizeof(cl_mem), &src);
+ CALL (clSetKernelArg, kernel, 1, sizeof(cl_mem), &dst);
+
+ /* Run the kernel */
+ CALL (clEnqueueNDRangeKernel, queue,
+ kernel,
+ 2,
+ NULL,
+ global_work_size,
+ local_work_size,
+ 0,
+ NULL,
+ NULL);
+
+ /* Be sure that everything run fine */
+#ifndef NDEBUG
+ dst_buffer = (uint32_t *) clIntelMapBuffer(dst, &status);
+ if (status != CL_SUCCESS)
+ goto error;
+ for (i = 0; i < n; ++i) assert(src_buffer[i] == dst_buffer[i]);
+ CALL (clIntelUnmapBuffer, dst);
+#endif /* NDEBUG */
+
+ free(src_buffer);
+ CALL (clReleaseMemObject, dst);
+ CALL (clReleaseMemObject, src);
+ cl_test_destroy();
+ printf("%i memory leaks\n", clIntelReportUnfreed());
+ assert(clIntelReportUnfreed() == 0);
+
+error:
+ cl_report_error(status);
+ return status;
+}
+
--- /dev/null
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia@intel.com>
+ */
+
+#include "cl_test.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+int
+main (int argc, char *argv[])
+{
+ cl_mem dst, src;
+ uint32_t *dst_buffer = NULL, *src_buffer = NULL;
+ const size_t n = 16;
+ const size_t global_work_size = n;
+ const size_t local_work_size = 16;
+ int status = 0, i;
+
+ if ((status = cl_test_init("CopyBuffer_0.bin", "CopyBuffer")) != 0)
+ goto error;
+
+ /* Fill the buffer with random values */
+ if ((src_buffer = malloc(sizeof(uint32_t) * n)) == NULL) {
+ fprintf(stderr, "Allocation failed\n");
+ status = CL_OUT_OF_HOST_MEMORY;
+ goto error;
+ }
+ if ((dst_buffer = malloc(sizeof(uint32_t) * n)) == NULL) {
+ fprintf(stderr, "Allocation failed\n");
+ status = CL_OUT_OF_HOST_MEMORY;
+ goto error;
+ }
+ for (i = 0; i < n; ++i)
+ src_buffer[i] = i;
+
+ /* Allocate the two buffers */
+ dst = clCreateBuffer(ctx, CL_MEM_COPY_HOST_PTR | CL_MEM_WRITE_ONLY, n * sizeof(uint32_t), dst_buffer, &status);
+ if (status != CL_SUCCESS)
+ goto error;
+ src = clCreateBuffer(ctx, CL_MEM_COPY_HOST_PTR | CL_MEM_READ_ONLY, n * sizeof(uint32_t), src_buffer, &status);
+ if (status != CL_SUCCESS)
+ goto error;
+
+ /* Set source and destination */
+ CALL (clSetKernelArg, kernel, 0, sizeof(cl_mem), &src);
+ CALL (clSetKernelArg, kernel, 1, sizeof(cl_mem), &dst);
+
+ /* Run the kernel */
+ CALL (clEnqueueNDRangeKernel, queue,
+ kernel,
+ 1,
+ NULL,
+ &global_work_size,
+ &local_work_size,
+ 0,
+ NULL,
+ NULL);
+
+ /* Be sure that everything run fine */
+ CALL (clEnqueueReadBuffer, queue,
+ dst,
+ CL_TRUE,
+ 0,
+ n* sizeof(uint32_t),
+ (void *)dst_buffer,
+ 0,
+ NULL,
+ NULL );
+ if (status != CL_SUCCESS)
+ goto error;
+ for (i = 0; i < n; ++i) {
+ printf("src_buffer[%d]:[%d], dst_buffer[%d]:[%d]\n",
+ i, src_buffer[i], i, dst_buffer[i]);
+// assert(src_buffer[i] == dst_buffer[i]);
+ }
+
+ free(src_buffer);
+ free(dst_buffer);
+ CALL (clReleaseMemObject, dst);
+ CALL (clReleaseMemObject, src);
+ cl_test_destroy();
+ printf("%i memory leaks\n", clIntelReportUnfreed());
+ assert(clIntelReportUnfreed() == 0);
+
+error:
+ cl_report_error(status);
+ return status;
+}
+
--- /dev/null
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia@intel.com>
+ */
+
+#include "cl_test.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+int
+main (int argc, char *argv[])
+{
+ const size_t n = 1024;
+ const size_t global_work_size = n;
+ const size_t local_work_size = 16;
+ int status = 0;
+
+ if ((status = cl_test_init("dummy.ll", "hop", LLVM)) != 0) goto error;
+
+ /* Run the kernel */
+ CALL (clEnqueueNDRangeKernel, queue,
+ kernel,
+ 1,
+ NULL,
+ &global_work_size,
+ &local_work_size,
+ 0,
+ NULL,
+ NULL);
+
+ cl_test_destroy();
+ printf("%i memory leaks\n", clIntelReportUnfreed());
+ assert(clIntelReportUnfreed() == 0);
+
+error:
+ cl_report_error(status);
+ return status;
+}
+
--- /dev/null
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia@intel.com>
+ */
+
+#include "cl_test.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+int
+main (int argc, char *argv[])
+{
+ cl_mem dst[24];
+ int *dst_buffer;
+ const size_t n = 32 * 1024 * 1024;
+ const size_t global_work_size = n;
+ const size_t local_work_size = 16;
+ int status = 0, i, j;
+
+ if ((status = cl_test_init("test_write_only.cl", "test_write_only", SOURCE)) != 0)
+ goto error;
+
+ for (j = 0; j < 24; ++j) {
+ /* Allocate the two buffers */
+ dst[j] = clCreateBuffer(ctx, 0, n * sizeof(uint32_t), NULL, &status);
+ if (status != CL_SUCCESS) goto error;
+
+ /* Set source and destination */
+ CALL (clSetKernelArg, kernel, 0, sizeof(cl_mem), &dst[j]);
+
+ /* Run the kernel */
+ CALL (clEnqueueNDRangeKernel, queue,
+ kernel,
+ 1,
+ NULL,
+ &global_work_size,
+ &local_work_size,
+ 0,
+ NULL,
+ NULL);
+
+ /* Be sure that everything run fine */
+ dst_buffer = (int *) clIntelMapBuffer(dst[j], &status);
+ if (status != CL_SUCCESS)
+ goto error;
+ for (i = 0; i < n; ++i) assert(dst_buffer[i] == i);
+ CALL (clIntelUnmapBuffer, dst[j]);
+ }
+
+ for (j = 0; j < 24; ++j) CALL (clReleaseMemObject, dst[j]);
+ cl_test_destroy();
+ printf("%i memory leaks\n", clIntelReportUnfreed());
+ assert(clIntelReportUnfreed() == 0);
+
+error:
+ cl_report_error(status);
+ return status;
+}
+
--- /dev/null
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia@intel.com>
+ */
+
+#include "cl_test.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+typedef union kernel_arg {
+ float f;
+ uint32_t u32;
+ uint16_t u16;
+ uint8_t u8;
+ int32_t i32;
+ int16_t i16;
+ int8_t i8;
+} kernel_arg_t;
+
+int
+main (int argc, char *argv[])
+{
+ cl_mem dst, src;
+ IF_DEBUG(uint32_t *dst_buffer = NULL);
+ uint32_t *src_buffer = NULL;
+ const size_t n = 6*16;
+ const size_t global_work_size = 16;
+ const size_t local_work_size = 16;
+ kernel_arg_t arg;
+ int status = 0, i, j;
+
+#if TEST_SIMD8
+ if ((status = cl_test_init("test_imm_parameters_kernels_0.bin8", "test_imm_parameters")) != 0)
+ goto error;
+#else
+ if ((status = cl_test_init("test_imm_parameters_kernels_0.bin", "test_imm_parameters")) != 0)
+ goto error;
+#endif
+
+ /* Fill the buffer with random values */
+ if ((src_buffer = malloc(sizeof(uint32_t) * n)) == NULL) {
+ fprintf(stderr, "Allocation failed\n");
+ status = CL_OUT_OF_HOST_MEMORY;
+ goto error;
+ }
+ for (i = 0; i < n; ++i)
+ src_buffer[i] = i;
+
+ /* Allocate the two buffers */
+ dst = clCreateBuffer(ctx, 0, n * sizeof(uint32_t), NULL, &status);
+ if (status != CL_SUCCESS)
+ goto error;
+ src = clCreateBuffer(ctx, CL_MEM_COPY_HOST_PTR, n * sizeof(uint32_t), src_buffer, &status);
+ if (status != CL_SUCCESS)
+ goto error;
+ free(src_buffer);
+
+ /* Set source and destination */
+ CALL (clSetKernelArg, kernel, 0, sizeof(cl_mem), &dst);
+ CALL (clSetKernelArg, kernel, 4, sizeof(cl_mem), &src);
+ arg.f = 1.f; CALL (clSetKernelArg, kernel, 1, sizeof(float), &arg);
+ arg.i8 = 2; CALL (clSetKernelArg, kernel, 2, sizeof(int8_t), &arg);
+ arg.i32 = 3; CALL (clSetKernelArg, kernel, 3, sizeof(int32_t), &arg);
+ arg.i16 = 4; CALL (clSetKernelArg, kernel, 5, sizeof(int16_t), &arg);
+ arg.u32 = 5; CALL (clSetKernelArg, kernel, 6, sizeof(uint32_t), &arg);
+ arg.i32 = 6; CALL (clSetKernelArg, kernel, 7, sizeof(int32_t), &arg);
+
+ /* Run the kernel */
+ CALL (clEnqueueNDRangeKernel, queue,
+ kernel,
+ 1,
+ NULL,
+ &global_work_size,
+ &local_work_size,
+ 0,
+ NULL,
+ NULL);
+
+ /* Be sure that everything run fine */
+#ifndef NDEBUG
+ dst_buffer = (uint32_t *) clIntelMapBuffer(dst, &status);
+ if (status != CL_SUCCESS)
+ goto error;
+#endif /* NDEBUG */
+
+ src_buffer = (uint32_t *) clIntelMapBuffer(src, &status);
+ if (status != CL_SUCCESS)
+ goto error;
+ for (i = 0; i < 16; ++i)
+ for (j = 0; j < 6; ++j)
+ assert(dst_buffer[6*i+j] == src_buffer[i] + 1 + j);
+
+ IF_DEBUG(CALL (clIntelUnmapBuffer, dst));
+ CALL (clIntelUnmapBuffer, src);
+ CALL (clReleaseMemObject, dst);
+ CALL (clReleaseMemObject, src);
+ cl_test_destroy();
+ printf("%i memory leaks\n", clIntelReportUnfreed());
+ assert(clIntelReportUnfreed() == 0);
+
+error:
+ cl_report_error(status);
+ return status;
+}
+
--- /dev/null
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia@intel.com>
+ */
+
+#include "cl_test.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#ifndef NDEBUG
+static const uint32_t wrk_value[32] = {
+ 62, 60, 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32, 30, 28, 26,
+ 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0
+};
+#endif /* NDEBUG */
+
+#define N 1024
+int
+main (int argc, char *argv[])
+{
+ cl_mem dst;
+ IF_DEBUG(uint32_t *dst_buffer = NULL);
+ const size_t n = N;
+ const size_t global_work_size = N;
+ const size_t local_work_size = 32;
+ int status = 0;
+#ifndef NDEBUG
+ int i;
+#endif /* NDEBUG */
+
+#if 0 //TEST_SIMD8
+ if ((status = cl_test_init("test_local_memory_0.bin8", "test_local_memory")) != 0)
+ goto error;
+#else
+ if ((status = cl_test_init("test_local_memory_0.bin", "test_local_memory")) != 0)
+ goto error;
+#endif
+
+ /* Allocate the two buffers */
+ dst = clCreateBuffer(ctx, 0, n * sizeof(uint32_t), NULL, &status);
+ if (status != CL_SUCCESS)
+ goto error;
+
+ /* Set source and destination */
+ CALL (clSetKernelArg, kernel, 0, sizeof(cl_mem), &dst);
+ CALL (clSetKernelArg, kernel, 1, 32*sizeof(int), NULL);
+ CALL (clSetKernelArg, kernel, 2, 32*sizeof(int), NULL);
+
+ /* Run the kernel */
+ CALL (clEnqueueNDRangeKernel, queue,
+ kernel,
+ 1,
+ NULL,
+ &global_work_size,
+ &local_work_size,
+ 0,
+ NULL,
+ NULL);
+
+ /* Be sure that everything run fine */
+#ifndef NDEBUG
+ dst_buffer = (uint32_t *) clIntelMapBuffer(dst, &status);
+ if (status != CL_SUCCESS)
+ goto error;
+ for (i = 0; i < N; ++i)
+ assert(dst_buffer[i] == wrk_value[i % 32]);
+ CALL (clIntelUnmapBuffer, dst);
+#endif /* NDEBUG */
+
+ CALL (clReleaseMemObject, dst);
+ cl_test_destroy();
+ printf("%i memory leaks\n", clIntelReportUnfreed());
+ assert(clIntelReportUnfreed() == 0);
+
+error:
+ cl_report_error(status);
+ return status;
+}
+
--- /dev/null
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia@intel.com>
+ */
+
+#include "cl_test.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+int
+main (int argc, char *argv[])
+{
+ cl_mem dst, src;
+ uint32_t *src_buffer = NULL;
+ const size_t n = 1024;
+ //const size_t global_work_size = 1024;
+ //const size_t local_work_size = 16;
+ int status = 0, i;
+
+ if ((status = cl_test_init("CopyBuffer_0.bin", "CopyBuffer")) != 0)
+ goto error;
+
+ /* Fill the buffer with random values */
+ if ((src_buffer = malloc(sizeof(uint32_t) * n)) == NULL) {
+ fprintf(stderr, "Allocation failed\n");
+ status = CL_OUT_OF_HOST_MEMORY;
+ goto error;
+ }
+ for (i = 0; i < n; ++i)
+ src_buffer[i] = rand();
+
+ /* Allocate the two buffers */
+ dst = clCreateBuffer(ctx, 0, n * sizeof(uint32_t), NULL, &status);
+ if (status != CL_SUCCESS)
+ goto error;
+ src = clCreateBuffer(ctx, CL_MEM_COPY_HOST_PTR, n * sizeof(uint32_t), src_buffer, &status);
+ if (status != CL_SUCCESS)
+ goto error;
+ free(src_buffer);
+
+ /* Set source and destination */
+ CALL (clSetKernelArg, kernel, 0, sizeof(cl_mem), &src);
+ CALL (clSetKernelArg, kernel, 1, sizeof(cl_mem), &dst);
+
+ /* Release the buffers we allocate here */
+ cl_test_destroy();
+ CALL (clReleaseMemObject, dst);
+ CALL (clReleaseMemObject, src);
+ printf("%i memory leaks\n", clIntelReportUnfreed());
+
+error:
+ cl_report_error(status);
+ return status;
+}
+
--- /dev/null
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia@intel.com>
+ */
+
+#include "cl_test.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#ifndef NDEBUG
+static const uint32_t wrk_value[32] = {
+ 64, 60, 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32, 30, 28, 26,
+ 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0
+};
+#endif /* NDEBUG */
+
+#define N 16
+int
+main (int argc, char *argv[])
+{
+ cl_mem dst;
+ uint32_t *dst_buffer = NULL;
+ const size_t n = N;
+ const size_t global_work_size = N;
+ const size_t local_work_size = 16;
+ int status = 0;
+
+#if TEST_SIMD8
+ if ((status = cl_test_init("test_private_memory_0.bin8", "test_private_memory")) != 0)
+ goto error;
+#else
+ if ((status = cl_test_init("test_private_memory_0.bin", "test_private_memory")) != 0)
+ goto error;
+#endif
+
+ dst = clCreateBuffer(ctx, 0, n * sizeof(uint32_t), NULL, &status);
+ if (status != CL_SUCCESS)
+ goto error;
+
+ /* Set source and destination */
+ CALL (clSetKernelArg, kernel, 0, sizeof(cl_mem), &dst);
+
+ /* Run the kernel */
+ CALL (clEnqueueNDRangeKernel, queue,
+ kernel,
+ 1,
+ NULL,
+ &global_work_size,
+ &local_work_size,
+ 0,
+ NULL,
+ NULL);
+
+ /* Be sure that everything run fine */
+ dst_buffer = (uint32_t *) clIntelMapBuffer(dst, &status);
+ if (status != CL_SUCCESS)
+ goto error;
+ int i;
+ for (i = 0; i < N; ++i)
+ //assert(dst_buffer[i] == wrk_value[i % 32]);
+ //printf("[%u %u]", (dst_buffer[i] >> 8)& 0xf, dst_buffer[i] & 0x3);
+ printf("[%u]", dst_buffer[i]);
+ assert(dst_buffer[0] == 496);
+ assert(dst_buffer[1] == 528);
+ assert(dst_buffer[2] == 560);
+ assert(dst_buffer[3] == 592);
+ assert(dst_buffer[4] == 624);
+ assert(dst_buffer[5] == 656);
+ assert(dst_buffer[6] == 688);
+ assert(dst_buffer[7] == 720);
+ assert(dst_buffer[8] == 752);
+ assert(dst_buffer[9] == 784);
+ assert(dst_buffer[10] == 816);
+ assert(dst_buffer[11] == 848);
+ assert(dst_buffer[12] == 880);
+ assert(dst_buffer[13] == 912);
+ assert(dst_buffer[14] == 944);
+ assert(dst_buffer[15] == 976);
+
+ CALL (clIntelUnmapBuffer, dst);
+ CALL (clReleaseMemObject, dst);
+ cl_test_destroy();
+ printf("%i memory leaks\n", clIntelReportUnfreed());
+ assert(clIntelReportUnfreed() == 0);
+
+error:
+ cl_report_error(status);
+ return status;
+}
+
--- /dev/null
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia@intel.com>
+ */
+
+#include "cl_test.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#ifndef NDEBUG
+static const uint32_t wrk_value[32] = {
+ 62, 60, 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32, 30, 28, 26,
+ 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0
+};
+#endif /* NDEBUG */
+
+#define N 544
+int
+main (int argc, char *argv[])
+{
+ cl_mem dst;
+ IF_DEBUG(uint32_t *dst_buffer = NULL);
+ const size_t n = N;
+ const size_t global_work_size = N;
+ const size_t local_work_size = 32;
+ int status = 0;
+ IF_DEBUG(int i);
+
+#if TEST_SIMD8
+ if ((status = cl_test_init("test_static_local_memory_0.bin8", "test_static_local_memory")) != 0)
+ goto error;
+#else
+ if ((status = cl_test_init("test_static_local_memory_0.bin", "test_static_local_memory")) != 0)
+ goto error;
+#endif
+
+ /* Allocate the two buffers */
+ dst = clCreateBuffer(ctx, 0, n * sizeof(uint32_t), NULL, &status);
+ if (status != CL_SUCCESS)
+ goto error;
+
+ /* Set source and destination */
+ CALL (clSetKernelArg, kernel, 0, sizeof(cl_mem), &dst);
+
+ /* Run the kernel */
+ CALL (clEnqueueNDRangeKernel, queue,
+ kernel,
+ 1,
+ NULL,
+ &global_work_size,
+ &local_work_size,
+ 0,
+ NULL,
+ NULL);
+
+ /* Be sure that everything run fine */
+#ifndef NDEBUG
+ dst_buffer = (uint32_t *) clIntelMapBuffer(dst, &status);
+ if (status != CL_SUCCESS)
+ goto error;
+ for (i = 0; i < N; ++i)
+ assert(dst_buffer[i] == wrk_value[i % 32]);
+ CALL (clIntelUnmapBuffer, dst);
+#endif /* NDEBUG */
+
+ CALL (clReleaseMemObject, dst);
+ cl_test_destroy();
+ printf("%i memory leaks\n", clIntelReportUnfreed());
+ assert(clIntelReportUnfreed() == 0);
+
+error:
+ cl_report_error(status);
+ return status;
+}
+
--- /dev/null
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia@intel.com>
+ */
+
+#include "cl_test.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+int
+main (int argc, char *argv[])
+{
+ cl_mem dst;
+ int *dst_buffer = NULL;
+ //const size_t n = 1024 * 1024;
+ const size_t n = 2048;
+ const size_t global_work_size = n;
+ const size_t local_work_size = 16;
+ int status = 0, i;
+
+ if ((status = cl_test_init("test_write_only.cl", "test_write_only", SOURCE)) != 0)
+ goto error;
+
+ /* Allocate the two buffers */
+ dst = clCreateBuffer(ctx, 0, n * sizeof(uint32_t), NULL, &status);
+ if (status != CL_SUCCESS) goto error;
+
+ /* Set source and destination */
+ CALL (clSetKernelArg, kernel, 0, sizeof(cl_mem), &dst);
+
+ /* Run the kernel */
+ CALL (clEnqueueNDRangeKernel, queue,
+ kernel,
+ 1,
+ NULL,
+ &global_work_size,
+ &local_work_size,
+ 0,
+ NULL,
+ NULL);
+
+ /* Be sure that everything run fine */
+ dst_buffer = (int *) clIntelMapBuffer(dst, &status);
+ if (status != CL_SUCCESS)
+ goto error;
+ for (i = 0; i < n; ++i) assert(dst_buffer[i] == i);
+ CALL (clIntelUnmapBuffer, dst);
+ CALL (clReleaseMemObject, dst);
+ cl_test_destroy();
+ printf("%i memory leaks\n", clIntelReportUnfreed());
+ assert(clIntelReportUnfreed() == 0);
+
+error:
+ cl_report_error(status);
+ return status;
+}
+
--- /dev/null
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia@intel.com>
+ */
+
+#include "common.h"
+void verify();
+float *input, *output, *refout;
+int width = 128;
+int height = 128;
+int blockSize = 16;
+
+int main(int argc, char**argv)
+{
+ struct args args = {0};
+ int err, i;
+
+ parseArgs(argc, argv, &args);
+
+ cl_device_id device = getDeviceID(args.d);
+ cl_context context = clCreateContext(0, 1, &device, NULL, NULL, &err); CHK_ERR(err);
+ cl_command_queue queue = clCreateCommandQueue(context, device, 0, &err); CHK_ERR(err);
+ cl_kernel kernel = getKernel(device, context, "transpose_kernels.cl", "matrixTranspose");
+
+ input = newBuffer(width * height * sizeof(cl_float), 'p');
+ output = newBuffer(width * height * sizeof(cl_float), '0');
+ refout = newBuffer(width * height * sizeof(cl_float), '0');
+
+
+ cl_mem inputBuffer = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
+ sizeof(cl_float) * width * height, input, &err); CHK_ERR(err);
+ cl_mem outputBuffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR,
+ sizeof(cl_float) * width * height, output, &err); CHK_ERR(err);
+
+ /* Execute */
+ size_t globalThreads[2] = {width, height};
+ size_t localThreads [2] = {blockSize, blockSize};
+
+
+ err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &outputBuffer); CHK_ERR(err);
+ err = clSetKernelArg(kernel, 1, sizeof(cl_mem), &inputBuffer); CHK_ERR(err);
+ err = clSetKernelArg(kernel, 2, sizeof(cl_float)*blockSize*blockSize, NULL); CHK_ERR(err);
+ err = clSetKernelArg(kernel, 3, sizeof(cl_int), &width); CHK_ERR(err);
+ err = clSetKernelArg(kernel, 4, sizeof(cl_int), &height); CHK_ERR(err);
+ err = clSetKernelArg(kernel, 5, sizeof(cl_int), &blockSize); CHK_ERR(err);
+
+ cl_event eND;
+ err = clEnqueueNDRangeKernel(queue, kernel, 2, NULL, globalThreads, localThreads,
+ 0, NULL, &eND);
+ CHK_ERR(err);
+
+ err = clEnqueueReadBuffer(queue, outputBuffer, CL_TRUE, 0, width * height * sizeof(cl_float), output,
+ 0, NULL, &eND);
+ CHK_ERR(err);
+
+ verify();
+}
+void transposeCPU(float *output, float *input, int width, int height)
+{
+ int i, j;
+
+ for(j=0; j < height; j++)
+ for(i=0; i < width; i++)
+ output[i*height + j] = input[j*width + i];
+}
+
+
+void verify()
+{
+ int i;
+ transposeCPU(refout, input, width, height);
+ comparef(refout, output, width * height, 1e-5f);
+ for (i=0; i<20; i++) {
+ printf("%12.8f %12.8f\n", refout[i], input[i]);
+ }
+ printf("Done\n");
+}
+
--- /dev/null
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia@intel.com>
+ */
+
+#include "common.h"
+void verify();
+static cl_uchar4 *inputImageData, *outputImageData, *refOut;
+static int *pixelData;
+static int width;
+static int height;
+static int pixelSize = sizeof(cl_uchar4);
+static int factor = 25.0;
+static int blockSizeX = 64;
+static int blockSizeY = 1;
+
+int main(int argc, char**argv)
+{
+ struct args args = {0};
+ int err;
+ char *ker_path = NULL;
+ char *input_path = NULL;
+ char *output_path = NULL;
+ parseArgs(argc, argv, &args);
+
+ cl_device_id device = getDeviceID(args.d);
+ ker_path = do_kiss_path("urng_kernels_0.bin", device);
+ input_path = do_kiss_path("urng_input.bmp", NULL);
+ output_path = do_kiss_path("urng_output.bmp", NULL);
+ cl_context context = clCreateContext(0, 1, &device, NULL, NULL, &err); CHK_ERR(err);
+ cl_command_queue queue = clCreateCommandQueue(context, device, 0, &err); CHK_ERR(err);
+ cl_kernel kernel = getKernelFromBinary(device, context, ker_path, "noise_uniform");
+
+ pixelData = readBmp(input_path, &width, &height);
+ inputImageData = (cl_uchar4*) newBuffer(width * height * pixelSize, 0);
+ outputImageData = (cl_uchar4*) newBuffer(width * height * pixelSize, 0);
+ refOut = (cl_uchar4*) newBuffer(width * height * pixelSize, 0);
+ memcpy(inputImageData, pixelData, width * height * pixelSize);
+
+ cl_mem inputImageBuffer = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR, width * height * pixelSize, inputImageData, &err); CHK_ERR(err);
+ cl_mem outputImageBuffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_COPY_HOST_PTR, width * height * pixelSize,
+ outputImageData, &err); CHK_ERR(err);
+
+ err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputImageBuffer); CHK_ERR(err);
+ err = clSetKernelArg(kernel, 1, sizeof(cl_mem), &outputImageBuffer); CHK_ERR(err);
+ err = clSetKernelArg(kernel, 2, sizeof(factor), &factor); CHK_ERR(err);
+
+ size_t globalThreads[] = {width, height};
+ size_t localThreads[] = {blockSizeX, blockSizeY};
+
+ err = clEnqueueNDRangeKernel(queue, kernel, 2, NULL, globalThreads, localThreads, 0, NULL, NULL);
+ CHK_ERR(err);
+ outputImageData = clIntelMapBuffer(outputImageBuffer, &err);
+ CHK_ERR(err);
+
+ writeBmp((int *) outputImageData, width, height, output_path);
+ verify();
+ clIntelUnmapBuffer(outputImageBuffer);
+ return err;
+}
+
+void verify()
+{
+ int i;
+ // comparef(d, c, MAX, 1.0e-6);
+ float mean = 0.0f;
+ for(i = 0; i < (int)(width * height); i++) {
+ mean += outputImageData[i].s[0] - inputImageData[i].s[0];
+ }
+ mean /= (width * height * factor);
+
+ if(fabs(mean) < 1.0) {
+ printf("%12.8f Passed!\n", mean);
+ } else {
+ printf("%12.8f Failed!\n", mean);
+ }
+ printf("Done\n");
+}
+
--- /dev/null
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Benjamin Segovia <benjamin.segovia@intel.com>
+ */
+
+#include "common.h"
+void verify();
+
+#define MAX 1000000
+float *a, *b, *c;
+
+int main(int argc, char**argv)
+{
+ struct args args = {0};
+ int err, i;
+
+ parseArgs(argc, argv, &args);
+
+ cl_device_id device = getDeviceID(args.d);
+ cl_context context = clCreateContext(0, 1, &device, NULL, NULL, &err); CHK_ERR(err);
+ cl_command_queue queue = clCreateCommandQueue(context, device, 0, &err); CHK_ERR(err);
+ cl_kernel kernel = getKernel(device, context, "vadd_kernel.cl", "vadd_gpu");
+
+ a = newBuffer(MAX * sizeof(float), 'f');
+ b = newBuffer(MAX * sizeof(float), 'f');
+ c = newBuffer(MAX * sizeof(float), '0');
+
+ cl_mem aBuffer = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR, MAX * sizeof(float), a, &err); CHK_ERR(err);
+ cl_mem bBuffer = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR, MAX * sizeof(float), b, &err); CHK_ERR(err);
+ cl_mem cBuffer = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR, MAX * sizeof(float), c, &err); CHK_ERR(err);
+
+ /* Execute */
+ int gws = MAX;
+ int lws = 16;
+
+ err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &aBuffer); CHK_ERR(err);
+ err = clSetKernelArg(kernel, 1, sizeof(cl_mem), &bBuffer); CHK_ERR(err);
+ err = clSetKernelArg(kernel, 2, sizeof(cl_mem), &cBuffer); CHK_ERR(err);
+ err = clSetKernelArg(kernel, 3, sizeof(size_t), &gws); CHK_ERR(err);
+
+ cl_event eND;
+ err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &gws, &lws, 0, NULL, &eND); CHK_ERR(err);
+ err = clEnqueueReadBuffer(queue, cBuffer, CL_TRUE, 0, MAX*sizeof(float), c, 1, &eND, NULL); CHK_ERR(err);
+
+ verify();
+}
+
+void verify()
+{
+ int i;
+ float *d;
+
+ d = newBuffer(MAX * sizeof(float), '0');
+ for (i = 0; i < MAX; i++) {
+ d[i] = a[i] + b[i];
+ }
+ comparef(d, c, MAX, 1.0e-6);
+ printf("Done\n");
+}
+