runtime/contrib/labs/opencl_test/src/opencl_test.cc

   1 /*
   2  * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
   3  *
   4  * Licensed under the Apache License, Version 2.0 (the "License");
   5  * you may not use this file except in compliance with the License.
   6  * You may obtain a copy of the License at
   7  *
   8  *    http://www.apache.org/licenses/LICENSE-2.0
   9  *
  10  * Unless required by applicable law or agreed to in writing, software
  11  * distributed under the License is distributed on an "AS IS" BASIS,
  12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13  * See the License for the specific language governing permissions and
  14  * limitations under the License.
  15  */
  16
  17 /*******************************************************************************
  18  * Copyright (c) 2008-2015 The Khronos Group Inc.
  19  *
  20  * Permission is hereby granted, free of charge, to any person obtaining a
  21  * copy of this software and/or associated documentation files (the
  22  * "Materials"), to deal in the Materials without restriction, including
  23  * without limitation the rights to use, copy, modify, merge, publish,
  24  * distribute, sublicense, and/or sell copies of the Materials, and to
  25  * permit persons to whom the Materials are furnished to do so, subject to
  26  * the following conditions:
  27  *
  28  * The above copyright notice and this permission notice shall be included
  29  * in all copies or substantial portions of the Materials.
  30  *
  31  * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  32  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  33  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
  34  * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
  35  * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
  36  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
  37  * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
  38  ******************************************************************************/
  39
  40 #include "arm_compute/core/CL/OpenCL.h"
  41
  42 #include <iostream>
  43 #include <vector>
  44
  45 void printDeviceInfo(int n, cl::Device &device, cl::Device &default_device)
  46 {
  47   bool is_default = (device() == default_device());
  48   std::cout << "\t\t\t#" << n << " Device: (id: " << device() << ") "
  49             << (is_default ? " -> default" : "") << "\n";
  50
  51   const auto name = device.getInfo<CL_DEVICE_NAME>();
  52   std::cout << "\t\t\t\tName: " << name << "\n";
  53
  54   const auto compute_unit = device.getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>();
  55   std::cout << "\t\t\t\tMax Compute Unit: " << compute_unit << "\n";
  56
  57   const auto max_work_item_size = device.getInfo<CL_DEVICE_MAX_WORK_ITEM_SIZES>();
  58   std::cout << "\t\t\t\tMax Work Item Size: [";
  59   for (auto size : max_work_item_size)
  60     std::cout << size << ",";
  61   std::cout << "]\n";
  62
  63   const auto max_work_group_size = device.getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>();
  64   std::cout << "\t\t\t\tMax Work Grpup Size: " << max_work_group_size << "\n";
  65
  66   const auto max_clock_frequency = device.getInfo<CL_DEVICE_MAX_CLOCK_FREQUENCY>();
  67   std::cout << "\t\t\t\tMax Clock Frequency: " << max_clock_frequency << "\n";
  68
  69   std::cout << "\n";
  70 }
  71
  72 class OpenCLGpu
  73 {
  74 public:
  75   cl::Platform platform_;
  76   cl::Context context_;
  77   cl::vector<cl::Device> devices_;
  78   std::vector<cl::CommandQueue *> q_;
  79   cl::Program program_;
  80
  81   OpenCLGpu()
  82   {
  83     cl_int cl_error;
  84
  85     platform_ = cl::Platform::getDefault();
  86
  87     try
  88     {
  89       cl_context_properties properties[3] = {CL_CONTEXT_PLATFORM,
  90                                              (cl_context_properties)platform_(), 0};
  91
  92       context_ = cl::Context(CL_DEVICE_TYPE_GPU, properties, NULL, NULL, &cl_error);
  93     }
  94     catch (cl::Error &err) // thrown when there is no Context for this platform
  95     {
  96       std::cout << "\t\t No Context Found\n";
  97       return;
  98     }
  99
 100     devices_ = context_.getInfo<CL_CONTEXT_DEVICES>();
 101
 102     for (int dev_id = 0; dev_id < devices_.size(); dev_id++)
 103     {
 104       cl::CommandQueue *que = new cl::CommandQueue(context_, devices_[dev_id]);
 105       q_.emplace_back(que);
 106     }
 107   }
 108
 109   ~OpenCLGpu()
 110   {
 111     for (auto each_q : q_)
 112       delete each_q;
 113   }
 114
 115   void buildProgram(std::string &kernel_source_code)
 116   {
 117     std::vector<std::string> programStrings{kernel_source_code};
 118
 119     program_ = cl::Program(context_, programStrings);
 120
 121     try
 122     {
 123       program_.build("-cl-std=CL1.2");
 124     }
 125     catch (cl::Error &err)
 126     {
 127       cl_int buildErr = CL_SUCCESS;
 128       auto buildInfo = program_.getBuildInfo<CL_PROGRAM_BUILD_LOG>(&buildErr);
 129       for (auto &pair : buildInfo)
 130       {
 131         std::cerr << pair.second << std::endl << std::endl;
 132       }
 133     }
 134   }
 135 };
 136
 137 void checkContextMem()
 138 {
 139   cl_int cl_error;
 140
 141   // get context, devices
 142   //
 143   std::cout << "\nChecking if devices in GPU shares the same memory address:\n\n";
 144
 145   OpenCLGpu gpu;
 146
 147   std::cout << "\nDevices in GPU:\n\n";
 148
 149   auto &devices = gpu.devices_;
 150   auto default_device = cl::Device::getDefault();
 151
 152   int d = 0;
 153   for (auto device : devices)
 154     printDeviceInfo(++d, device, default_device);
 155
 156   if (d < 2)
 157   {
 158     std::cout << "\t\t This options works when there are n (>= 2) devices.\n";
 159     return;
 160   }
 161
 162   // allocate and map memory
 163
 164   typedef cl_int T;
 165   const int items_per_device = 128;
 166   const int length = items_per_device * devices.size();
 167
 168   std::vector<T> input(length);
 169   std::vector<T> output(length, 0);
 170
 171   for (int i = 0; i < length; i++)
 172     input[i] = i;
 173
 174   cl::Buffer input_buf(gpu.context_, (cl_mem_flags)CL_MEM_USE_HOST_PTR, length * sizeof(T),
 175                        input.data(), &cl_error);
 176   cl::Buffer output_buf(gpu.context_, (cl_mem_flags)CL_MEM_USE_HOST_PTR, length * sizeof(T),
 177                         output.data(), &cl_error);
 178
 179   // compile test cl code
 180
 181   std::string kernel_source{"typedef int T;                                                 \n"
 182                             "kernel void memory_test(                                       \n"
 183                             "   const int dev_id,                                           \n"
 184                             "   global T* input,                                            \n"
 185                             "   global T* output,                                           \n"
 186                             "   const int start_idx,                                        \n"
 187                             "   const int count)                                            \n"
 188                             "{                                                              \n"
 189                             "   int input_idx = get_global_id(0);                           \n"
 190                             "   if(input_idx < count)                                       \n"
 191                             "   {                                                           \n"
 192                             "       int output_idx = start_idx + input_idx;                 \n"
 193                             "       output[output_idx] = input[input_idx] + dev_id;         \n"
 194                             "   }                                                           \n"
 195                             "}                                                              \n"};
 196
 197   gpu.buildProgram(kernel_source);
 198
 199   try
 200   {
 201     auto kernel_functor = cl::KernelFunctor<cl_int, cl::Buffer, cl::Buffer, cl_int, cl_int>(
 202         gpu.program_, "memory_test"); // name should be same as cl function name
 203
 204     // create a queue per device and queue a kernel job
 205
 206     for (int dev_id = 0; dev_id < devices.size(); dev_id++)
 207     {
 208       kernel_functor(cl::EnqueueArgs(*(gpu.q_[dev_id]), cl::NDRange(items_per_device)),
 209                      (cl_int)dev_id, // dev id
 210                      input_buf, output_buf,
 211                      (cl_int)(items_per_device * dev_id), // start index
 212                      (cl_int)(items_per_device),          // count
 213                      cl_error);
 214     }
 215
 216     // sync
 217
 218     for (d = 0; d < devices.size(); d++)
 219       (gpu.q_[d])->finish();
 220
 221     // check if memory state changed by all devices
 222
 223     cl::copy(*(gpu.q_[0]), output_buf, begin(output), end(output));
 224
 225     bool use_same_memory = true;
 226
 227     for (int dev_id = 0; dev_id < devices.size(); dev_id++)
 228     {
 229       for (int i = 0; i < items_per_device; ++i)
 230       {
 231         int output_idx = items_per_device * dev_id + i;
 232         if (output[output_idx] != input[i] + dev_id)
 233         {
 234           std::cout << "Output[" << output_idx << "] : "
 235                     << "expected = " << input[i] + dev_id << "; actual = " << output[output_idx]
 236                     << "\n";
 237           use_same_memory = false;
 238           break;
 239         }
 240       }
 241     }
 242
 243     if (use_same_memory)
 244       std::cout << "\n=> Mapped memory addresses used by devices in GPU are same.\n\n";
 245     else
 246       std::cout << "\n=> Mapped memory addresses used by devices in GPU are different.\n\n";
 247   }
 248   catch (cl::Error &err)
 249   {
 250     std::cerr << "error: code: " << err.err() << ", what: " << err.what() << std::endl;
 251   }
 252 }
 253
 254 void printHelp()
 255 {
 256   std::cout << "opencl information: \n\n";
 257   std::cout << "\t -h : help\n";
 258   std::cout
 259       << "\t -g : print if memory map is shared among devices in GPU (in default platform)\n\n";
 260   std::cout << "\t -s : test for synchronized work by two devices in a GPU\n\n";
 261 }
 262
 263 #include <mutex>
 264 #include <chrono>
 265 #include <thread>
 266 #include <condition_variable>
 267
 268 #define MAX_DEVICE_NUM 8 // just for testing
 269
 270 int kernel_idx[MAX_DEVICE_NUM];
 271 unsigned char kernel_completed = 0x00; // bit 0 = 1 means kernel by device[0] was completed.
 272 unsigned char
 273     kernel_completed_flag; // if comparing kernel_completed with this var, all kernels are completed
 274 int device_num;
 275 std::mutex kernel_complete_handler_mutex;
 276
 277 std::condition_variable wakeup_main;
 278 std::mutex wakeup_main_mutex;
 279
 280 void notifyKernelFinished(cl_event ev, cl_int ev_info, void *device_idx)
 281 {
 282   std::cout << "callback from device[" << *((int *)device_idx) << "] : ==> completed.\n";
 283
 284   std::unique_lock<std::mutex> lock(kernel_complete_handler_mutex);
 285
 286   kernel_completed |= 0x01 << *((int *)device_idx);
 287   if (kernel_completed == kernel_completed_flag)
 288     wakeup_main.notify_one();
 289 }
 290
 291 void testSync()
 292 {
 293   OpenCLGpu gpu;
 294
 295   cl_int cl_error;
 296   typedef cl_int T;
 297   const int items_per_device = 1024 * 768;
 298   const int length = items_per_device * gpu.devices_.size();
 299
 300   std::vector<T> output(length, 0);
 301
 302   cl::Buffer output_buf(gpu.context_, (cl_mem_flags)CL_MEM_USE_HOST_PTR, length * sizeof(T),
 303                         output.data(), &cl_error);
 304
 305   std::string kernel_source{"kernel void test(global float* output, const int count)  \n"
 306                             "{                                                        \n"
 307                             "   int idx = get_global_id(0);                           \n"
 308                             "   if(idx < count)                                       \n"
 309                             "   {                                                     \n"
 310                             "       float x = hypot(idx/1.111, idx*1.111);            \n"
 311                             "       for (int y = 0; y < 200; y++)                     \n"
 312                             "         x = rootn(log(pown(rootn(log(pown(x, 20)), 5), 20)), 5);  \n"
 313                             "       output[idx] = x;                                  \n"
 314                             "   }                                                     \n"
 315                             "}                                                        \n"};
 316
 317   gpu.buildProgram(kernel_source);
 318
 319   try
 320   {
 321     auto kernel_functor = cl::KernelFunctor<cl::Buffer, cl_int>(
 322         gpu.program_, "test"); // name should be same as cl function name
 323
 324     // variable init
 325     cl::Event ev[MAX_DEVICE_NUM];
 326
 327     device_num = gpu.devices_.size();
 328
 329     kernel_completed = 0;
 330     kernel_completed_flag = 0;
 331     for (int i = 0; i < device_num; i++)
 332     {
 333       kernel_idx[i] = i;
 334       kernel_completed_flag |= 0x01 << i;
 335     }
 336
 337     // create a queue per device and queue a kernel job
 338     // queueing with callback function
 339     for (int dev_id = 0; dev_id < gpu.devices_.size(); dev_id++)
 340     {
 341       ev[dev_id] = kernel_functor(cl::EnqueueArgs(*(gpu.q_[dev_id]), cl::NDRange(items_per_device)),
 342                                   output_buf,
 343                                   (cl_int)(items_per_device), // count
 344                                   cl_error);
 345       ev[dev_id].setCallback(CL_COMPLETE, notifyKernelFinished, (void *)(kernel_idx + dev_id));
 346
 347       // how to check kernel execution status
 348       //
 349       // auto status  = ev[dev_id].getInfo<CL_EVENT_COMMAND_EXECUTION_STATUS>();
 350       // std::cout << "Event status = " << (status == CL_QUEUED ? "CL_QUEUED" : status ==
 351       // CL_SUBMITTED ? "CL_SUBMITTED" : status == CL_COMPLETE ? "CL_COMPLETE" : "unknown")
 352       //           << std::endl;
 353       // std::cout << "Event status code = " << status << std::endl;
 354     }
 355
 356     // long wait until kernels are over
 357     {
 358       std::unique_lock<std::mutex> lk(wakeup_main_mutex);
 359       wakeup_main.wait(lk, [] { return (kernel_completed == kernel_completed_flag); });
 360
 361       std::cout << "all devices were completed.\n";
 362     }
 363   }
 364   catch (cl::Error &err)
 365   {
 366     std::cerr << "error: code: " << err.err() << ", what: " << err.what() << std::endl;
 367   }
 368 }
 369
 370 int main(const int argc, char **argv)
 371 {
 372   if (argc < 2)
 373     printHelp();
 374   else
 375   {
 376     std::string option = argv[1];
 377
 378     if (option == "-h") // help
 379       printHelp();
 380     else if (option == "-g") // check if devices in GPU uses same memory address
 381       checkContextMem();
 382     else if (option == "-s") // check synchronization between devices in GPU
 383       testSync();
 384   }
 385   return 0;
 386 }