2 * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
17 /*******************************************************************************
18 * Copyright (c) 2008-2015 The Khronos Group Inc.
20 * Permission is hereby granted, free of charge, to any person obtaining a
21 * copy of this software and/or associated documentation files (the
22 * "Materials"), to deal in the Materials without restriction, including
23 * without limitation the rights to use, copy, modify, merge, publish,
24 * distribute, sublicense, and/or sell copies of the Materials, and to
25 * permit persons to whom the Materials are furnished to do so, subject to
26 * the following conditions:
28 * The above copyright notice and this permission notice shall be included
29 * in all copies or substantial portions of the Materials.
31 * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
32 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
33 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
34 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
35 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
36 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
37 * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
38 ******************************************************************************/
40 #include "arm_compute/core/CL/OpenCL.h"
45 void printDeviceInfo(int n, cl::Device &device, cl::Device &default_device)
47 bool is_default = (device() == default_device());
48 std::cout << "\t\t\t#" << n << " Device: (id: " << device() << ") "
49 << (is_default ? " -> default" : "") << "\n";
51 const auto name = device.getInfo<CL_DEVICE_NAME>();
52 std::cout << "\t\t\t\tName: " << name << "\n";
54 const auto compute_unit = device.getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>();
55 std::cout << "\t\t\t\tMax Compute Unit: " << compute_unit << "\n";
57 const auto max_work_item_size = device.getInfo<CL_DEVICE_MAX_WORK_ITEM_SIZES>();
58 std::cout << "\t\t\t\tMax Work Item Size: [";
59 for (auto size : max_work_item_size)
60 std::cout << size << ",";
63 const auto max_work_group_size = device.getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>();
64 std::cout << "\t\t\t\tMax Work Grpup Size: " << max_work_group_size << "\n";
66 const auto max_clock_frequency = device.getInfo<CL_DEVICE_MAX_CLOCK_FREQUENCY>();
67 std::cout << "\t\t\t\tMax Clock Frequency: " << max_clock_frequency << "\n";
75 cl::Platform platform_;
77 cl::vector<cl::Device> devices_;
78 std::vector<cl::CommandQueue *> q_;
85 platform_ = cl::Platform::getDefault();
89 cl_context_properties properties[3] = {CL_CONTEXT_PLATFORM,
90 (cl_context_properties)platform_(), 0};
92 context_ = cl::Context(CL_DEVICE_TYPE_GPU, properties, NULL, NULL, &cl_error);
94 catch (cl::Error &err) // thrown when there is no Context for this platform
96 std::cout << "\t\t No Context Found\n";
100 devices_ = context_.getInfo<CL_CONTEXT_DEVICES>();
102 for (int dev_id = 0; dev_id < devices_.size(); dev_id++)
104 cl::CommandQueue *que = new cl::CommandQueue(context_, devices_[dev_id]);
105 q_.emplace_back(que);
111 for (auto each_q : q_)
115 void buildProgram(std::string &kernel_source_code)
117 std::vector<std::string> programStrings{kernel_source_code};
119 program_ = cl::Program(context_, programStrings);
123 program_.build("-cl-std=CL1.2");
125 catch (cl::Error &err)
127 cl_int buildErr = CL_SUCCESS;
128 auto buildInfo = program_.getBuildInfo<CL_PROGRAM_BUILD_LOG>(&buildErr);
129 for (auto &pair : buildInfo)
131 std::cerr << pair.second << std::endl << std::endl;
137 void checkContextMem()
141 // get context, devices
143 std::cout << "\nChecking if devices in GPU shares the same memory address:\n\n";
147 std::cout << "\nDevices in GPU:\n\n";
149 auto &devices = gpu.devices_;
150 auto default_device = cl::Device::getDefault();
153 for (auto device : devices)
154 printDeviceInfo(++d, device, default_device);
158 std::cout << "\t\t This options works when there are n (>= 2) devices.\n";
162 // allocate and map memory
165 const int items_per_device = 128;
166 const int length = items_per_device * devices.size();
168 std::vector<T> input(length);
169 std::vector<T> output(length, 0);
171 for (int i = 0; i < length; i++)
174 cl::Buffer input_buf(gpu.context_, (cl_mem_flags)CL_MEM_USE_HOST_PTR, length * sizeof(T),
175 input.data(), &cl_error);
176 cl::Buffer output_buf(gpu.context_, (cl_mem_flags)CL_MEM_USE_HOST_PTR, length * sizeof(T),
177 output.data(), &cl_error);
179 // compile test cl code
181 std::string kernel_source{"typedef int T; \n"
182 "kernel void memory_test( \n"
183 " const int dev_id, \n"
184 " global T* input, \n"
185 " global T* output, \n"
186 " const int start_idx, \n"
187 " const int count) \n"
189 " int input_idx = get_global_id(0); \n"
190 " if(input_idx < count) \n"
192 " int output_idx = start_idx + input_idx; \n"
193 " output[output_idx] = input[input_idx] + dev_id; \n"
197 gpu.buildProgram(kernel_source);
201 auto kernel_functor = cl::KernelFunctor<cl_int, cl::Buffer, cl::Buffer, cl_int, cl_int>(
202 gpu.program_, "memory_test"); // name should be same as cl function name
204 // create a queue per device and queue a kernel job
206 for (int dev_id = 0; dev_id < devices.size(); dev_id++)
208 kernel_functor(cl::EnqueueArgs(*(gpu.q_[dev_id]), cl::NDRange(items_per_device)),
209 (cl_int)dev_id, // dev id
210 input_buf, output_buf,
211 (cl_int)(items_per_device * dev_id), // start index
212 (cl_int)(items_per_device), // count
218 for (d = 0; d < devices.size(); d++)
219 (gpu.q_[d])->finish();
221 // check if memory state changed by all devices
223 cl::copy(*(gpu.q_[0]), output_buf, begin(output), end(output));
225 bool use_same_memory = true;
227 for (int dev_id = 0; dev_id < devices.size(); dev_id++)
229 for (int i = 0; i < items_per_device; ++i)
231 int output_idx = items_per_device * dev_id + i;
232 if (output[output_idx] != input[i] + dev_id)
234 std::cout << "Output[" << output_idx << "] : "
235 << "expected = " << input[i] + dev_id << "; actual = " << output[output_idx]
237 use_same_memory = false;
244 std::cout << "\n=> Mapped memory addresses used by devices in GPU are same.\n\n";
246 std::cout << "\n=> Mapped memory addresses used by devices in GPU are different.\n\n";
248 catch (cl::Error &err)
250 std::cerr << "error: code: " << err.err() << ", what: " << err.what() << std::endl;
256 std::cout << "opencl information: \n\n";
257 std::cout << "\t -h : help\n";
259 << "\t -g : print if memory map is shared among devices in GPU (in default platform)\n\n";
260 std::cout << "\t -s : test for synchronized work by two devices in a GPU\n\n";
266 #include <condition_variable>
268 #define MAX_DEVICE_NUM 8 // just for testing
270 int kernel_idx[MAX_DEVICE_NUM];
271 unsigned char kernel_completed = 0x00; // bit 0 = 1 means kernel by device[0] was completed.
273 kernel_completed_flag; // if comparing kernel_completed with this var, all kernels are completed
275 std::mutex kernel_complete_handler_mutex;
277 std::condition_variable wakeup_main;
278 std::mutex wakeup_main_mutex;
280 void notifyKernelFinished(cl_event ev, cl_int ev_info, void *device_idx)
282 std::cout << "callback from device[" << *((int *)device_idx) << "] : ==> completed.\n";
284 std::unique_lock<std::mutex> lock(kernel_complete_handler_mutex);
286 kernel_completed |= 0x01 << *((int *)device_idx);
287 if (kernel_completed == kernel_completed_flag)
288 wakeup_main.notify_one();
297 const int items_per_device = 1024 * 768;
298 const int length = items_per_device * gpu.devices_.size();
300 std::vector<T> output(length, 0);
302 cl::Buffer output_buf(gpu.context_, (cl_mem_flags)CL_MEM_USE_HOST_PTR, length * sizeof(T),
303 output.data(), &cl_error);
305 std::string kernel_source{"kernel void test(global float* output, const int count) \n"
307 " int idx = get_global_id(0); \n"
308 " if(idx < count) \n"
310 " float x = hypot(idx/1.111, idx*1.111); \n"
311 " for (int y = 0; y < 200; y++) \n"
312 " x = rootn(log(pown(rootn(log(pown(x, 20)), 5), 20)), 5); \n"
313 " output[idx] = x; \n"
317 gpu.buildProgram(kernel_source);
321 auto kernel_functor = cl::KernelFunctor<cl::Buffer, cl_int>(
322 gpu.program_, "test"); // name should be same as cl function name
325 cl::Event ev[MAX_DEVICE_NUM];
327 device_num = gpu.devices_.size();
329 kernel_completed = 0;
330 kernel_completed_flag = 0;
331 for (int i = 0; i < device_num; i++)
334 kernel_completed_flag |= 0x01 << i;
337 // create a queue per device and queue a kernel job
338 // queueing with callback function
339 for (int dev_id = 0; dev_id < gpu.devices_.size(); dev_id++)
341 ev[dev_id] = kernel_functor(cl::EnqueueArgs(*(gpu.q_[dev_id]), cl::NDRange(items_per_device)),
343 (cl_int)(items_per_device), // count
345 ev[dev_id].setCallback(CL_COMPLETE, notifyKernelFinished, (void *)(kernel_idx + dev_id));
347 // how to check kernel execution status
349 // auto status = ev[dev_id].getInfo<CL_EVENT_COMMAND_EXECUTION_STATUS>();
350 // std::cout << "Event status = " << (status == CL_QUEUED ? "CL_QUEUED" : status ==
351 // CL_SUBMITTED ? "CL_SUBMITTED" : status == CL_COMPLETE ? "CL_COMPLETE" : "unknown")
353 // std::cout << "Event status code = " << status << std::endl;
356 // long wait until kernels are over
358 std::unique_lock<std::mutex> lk(wakeup_main_mutex);
359 wakeup_main.wait(lk, [] { return (kernel_completed == kernel_completed_flag); });
361 std::cout << "all devices were completed.\n";
364 catch (cl::Error &err)
366 std::cerr << "error: code: " << err.err() << ", what: " << err.what() << std::endl;
370 int main(const int argc, char **argv)
376 std::string option = argv[1];
378 if (option == "-h") // help
380 else if (option == "-g") // check if devices in GPU uses same memory address
382 else if (option == "-s") // check synchronization between devices in GPU