[enhance] Registering OpenCL kernels at cl_context
authorDebadri Samaddar <s.debadri@samsung.com>
Wed, 11 Sep 2024 08:05:22 +0000 (13:35 +0530)
committerJijoong Moon <jijoong.moon@samsung.com>
Sun, 22 Sep 2024 23:06:56 +0000 (08:06 +0900)
Register custom kernels as well as in-house kernels at cl_context initialization

Signed-off-by: Debadri Samaddar <s.debadri@samsung.com>
api/ccapi/include/layer.h
api/ccapi/src/factory.cpp
nntrainer/cl_context.cpp
nntrainer/cl_context.h
nntrainer/opencl/opencl_command_queue_manager.cpp
nntrainer/opencl/opencl_command_queue_manager.h
nntrainer/tensor/cl_operations/blas_kernels.cpp
nntrainer/tensor/cl_operations/blas_kernels.h

index 3740500aa6ff7e18d817489309a4771016fa1b08..d9f9cffdd25d73375778bb03e5a14726d2531ecb 100644 (file)
@@ -266,7 +266,8 @@ createLayer(const LayerType &type,
  */
 std::unique_ptr<Layer>
 createLayer(const std::string &type,
-            const std::vector<std::string> &properties = {});
+            const std::vector<std::string> &properties = {},
+            const LayerComputeEngine &compute_engine = LayerComputeEngine::CPU);
 
 /**
  * @brief General Layer Factory function to register Layer
index e0e6fc24a8943f26e4de1db2b254d6bf0344f7aa..5f2b2dd2b9bc9c7f3c117b3ee2e7c132ccbcf05d 100644 (file)
@@ -40,8 +40,9 @@ std::unique_ptr<Layer> createLayer(const LayerType &type,
  * @brief Factory creator with constructor for layer
  */
 std::unique_ptr<Layer> createLayer(const std::string &type,
-                                   const std::vector<std::string> &properties) {
-  return nntrainer::createLayerNode(type, properties);
+                                   const std::vector<std::string> &properties,
+                                   const LayerComputeEngine &compute_engine) {
+  return nntrainer::createLayerNode(type, properties, compute_engine);
 }
 
 std::unique_ptr<Optimizer>
index a1288cbdc0946296067aed2d0754885d7aeb09cc..9e28e47d06567f1d08e5021cfffaf1eabc26386c 100644 (file)
@@ -119,6 +119,77 @@ const int ClContext::registerFactory(const FactoryType<T> factory,
   return assigned_int_key;
 }
 
+const ClContext::SharedPtrClKernel &
+ClContext::registerClKernel(std::string kernel_string,
+                            std::string kernel_name) {
+  // check if created before
+  if (ocl_kernel_map.find(kernel_name) != ocl_kernel_map.end()) {
+    ml_logi("Kernel already registered and initialized: %s",
+            kernel_name.c_str());
+    return ocl_kernel_map[kernel_name];
+  }
+
+  // creating shared_ptr for kernel object
+  SharedPtrClKernel kernelPtr = std::make_shared<opencl::Kernel>();
+  if (!clCreateKernel(kernel_string, kernel_name, kernelPtr)) {
+    ml_loge("Failed to register kernel %s", kernel_name.c_str());
+    return nullptr;
+  }
+  // add to map
+  ocl_kernel_map.emplace(kernel_name, kernelPtr);
+  return ocl_kernel_map[kernel_name];
+}
+
+bool ClContext::clCreateKernel(std::string &kernel_string,
+                               std::string &kernel_name,
+                               const SharedPtrClKernel &kernel_ptr_) {
+
+  ml_logi("Kernel initializing: %s", kernel_name.c_str());
+
+  bool result = false;
+
+  do {
+    opencl::Program program;
+
+    // reading binary
+    std::ifstream fs(opencl::Program::DEFAULT_KERNEL_PATH + "/" + kernel_name +
+                       "_kernel.bin",
+                     std::ios::binary | std::ios::in);
+
+    if (fs.good()) {
+      fs.seekg(0, std::ios::end);
+      size_t binary_size = fs.tellg();
+      fs.seekg(0, std::ios::beg);
+
+      unsigned char chunk[binary_size];
+      fs.read((char *)chunk, binary_size);
+
+      result = program.CreateCLProgramWithBinary(
+        context_inst_.GetContext(), context_inst_.GetDeviceId(), binary_size,
+        chunk,
+        opencl::Program::DEFAULT_KERNEL_PATH + "/" + kernel_name +
+          "_kernel.bin",
+        "");
+    } else {
+      result =
+        program.CreateCLProgram(context_inst_.GetContext(),
+                                context_inst_.GetDeviceId(), kernel_string, "");
+    }
+
+    if (!result) {
+      break;
+    }
+
+    result = kernel_ptr_->CreateKernelFromProgram(program, kernel_name);
+    if (!result) {
+      break;
+    }
+
+  } while (false);
+
+  return result;
+}
+
 /**
  * @copydoc const int ClContext::registerFactory
  */
index 31e8d7e7dd20deb7ddcf8d5f998cccf96948474b..6beee993fb57251ab4e57e35167687c0f5da375a 100644 (file)
@@ -30,6 +30,9 @@
 #include <layer_devel.h>
 
 #include <opencl_command_queue_manager.h>
+#include <opencl_context_manager.h>
+#include <opencl_kernel.h>
+#include <opencl_program.h>
 
 #include <nntrainer_log.h>
 
@@ -49,6 +52,8 @@ public:
 
   template <typename T> using PtrType = std::unique_ptr<T>;
 
+  using SharedPtrClKernel = std::shared_ptr<opencl::Kernel>;
+
   template <typename T>
   using FactoryType = std::function<PtrType<T>(const PropsType &)>;
 
@@ -61,6 +66,9 @@ public:
   /** integer to string key */
   using IntIndexType = std::unordered_map<int, std::string>;
 
+  /** string to kernel pointer map*/
+  using OclKernelMap = std::unordered_map<std::string, SharedPtrClKernel>;
+
   /**
    * This type contains tuple of
    * 1) integer -> string index
@@ -71,6 +79,12 @@ public:
 
   template <typename... Ts> using FactoryMap = std::tuple<IndexType<Ts>...>;
 
+  // getting static instance of commandqueue and opencl context
+  opencl::CommandQueueManager &command_queue_inst_ =
+    opencl::CommandQueueManager::GetInstance();
+
+  opencl::ContextManager &context_inst_ = opencl::ContextManager::GetInstance();
+
   /**
    * @brief   Default constructor
    */
@@ -183,12 +197,23 @@ public:
     return entry->second(props);
   }
 
+  /**
+   * @brief register or return already present OpenCl kernel pointer
+   * @param kernel_string kernel implementation string
+   * @param kernel_name kernel name
+   * @return reference of std::shared_ptr<opencl::Kernel>
+   */
+  const SharedPtrClKernel &registerClKernel(std::string kernel_string,
+                                            std::string kernel_name);
+
   /**
    * @brief destructor to release opencl commandQueue
    */
   ~ClContext() {
     if (cl_initialized) {
       command_queue_inst_.ReleaseCommandQueue();
+      // getContext() is called by clCreateKernel
+      context_inst_.ReleaseContext();
     }
   };
 
@@ -200,6 +225,9 @@ private:
 
   template <typename Args, typename T> struct isSupportedHelper;
 
+  // map to store initialized opencl::Kernel
+  OclKernelMap ocl_kernel_map;
+
   /**
    * @brief supportHelper to check if given type is supported within cl context
    */
@@ -215,10 +243,6 @@ private:
   template <typename T>
   struct isSupported : isSupportedHelper<T, decltype(factory_map)> {};
 
-  // getting static instance of commandqueue
-  opencl::CommandQueueManager &command_queue_inst_ =
-    opencl::CommandQueueManager::GetInstance();
-
   /**
    * @brief Initialize opencl commandqueue and context
    * @return true if OpenCL context and command queue creation is successful,
@@ -235,6 +259,16 @@ private:
     cl_initialized = result;
     return cl_initialized;
   };
+
+  /**
+   * @brief create OpenCl kernel
+   * @param kernel_string reference of implementation string
+   * @param kernel_name reference of kernel_name
+   * @param kernel_ptr_ reference of shared_ptr of Kernel
+   * @return true if successful, false otherwise
+   */
+  bool clCreateKernel(std::string &kernel_string, std::string &kernel_name,
+                      const SharedPtrClKernel &kernel_ptr_);
 };
 
 /**
index c55ef0c0088bb903e7240a402f784ad4d980e6fd..109d83e78a0895b460d2691b8602edbf55dc0112 100644 (file)
@@ -202,4 +202,34 @@ bool CommandQueueManager::DispatchCommand(Kernel kernel,
   return true;
 }
 
+bool CommandQueueManager::DispatchCommand(
+  const std::shared_ptr<Kernel> &kernel_ptr, const int (&work_groups_count)[3],
+  const int (&work_group_size)[3], cl_event *event) {
+
+  // work_dim of 2 has been hardcoded, might be modified later based on
+  // requirements
+
+  // setting the local_work_size referred to as the size of the
+  // work-group
+  const size_t local[2] = {static_cast<size_t>(work_group_size[0]),
+                           static_cast<size_t>(work_group_size[1])};
+
+  // setting the global_work_size that describe the number of global work-items
+  const size_t global[2] = {static_cast<size_t>(work_groups_count[0]),
+                            static_cast<size_t>(work_groups_count[1])};
+
+  cl_kernel kernel_ = kernel_ptr->GetKernel();
+
+  // returns NULL with error code if fails
+  const int error_code = clEnqueueNDRangeKernel(
+    command_queue_, kernel_, 2, nullptr, global, local, 0, nullptr, event);
+  if (error_code != CL_SUCCESS) {
+    ml_loge("Failed to clEnqueueNDRangeKernel. OpenCL error code: %d",
+            error_code);
+    return false;
+  }
+
+  return true;
+}
+
 } // namespace nntrainer::opencl
index 5da2a570db06639767eea89733df48bb0b18aaf1..1e98f202f55300bf494367ee90c0b95d3fabaa57 100644 (file)
@@ -16,6 +16,7 @@
 
 #include "opencl_kernel.h"
 #include "third_party/cl.h"
+#include <memory>
 
 namespace nntrainer::opencl {
 
@@ -99,6 +100,22 @@ public:
                        const int (&work_group_size)[3],
                        cl_event *event = nullptr);
 
+  /**
+   * @brief Overloaded function to initiate execution of the command queue.
+   *
+   * @param kernel_ptr reference of OpenCL kernel shared_ptr
+   * @param work_groups_count Total number of work items that will execute the
+   * kernel function
+   * @param work_group_size Number of work items that make up a work group
+   * @param event Object that identifies this command and can be used to query
+   * or wait for this command to complete
+   * @return true if command queue execution is successful or false otherwise
+   */
+  bool DispatchCommand(const std::shared_ptr<Kernel> &kernel_ptr,
+                       const int (&work_groups_count)[3],
+                       const int (&work_group_size)[3],
+                       cl_event *event = nullptr);
+
   /**
    * @brief Get the OpenCL Command Queue object
    *
index 5c0d1dfa72abcb2a99fe751da03fa7272d54d167..c7df348d0485d40cff59ed845f65a16d371c61b3 100644 (file)
@@ -459,9 +459,11 @@ void sscal_cl(float *X, const unsigned int N, const float alpha,
   bool result = false;
 
   do {
-    result = context.clCreateKernel(sscal_cl_kernel_,
-                                    context.LayerKernel::SSCAL, kernel_sscal);
-    if (!result) {
+    auto &cc = ClContext::Global();
+    ClContext::SharedPtrClKernel kernel_ptr =
+      cc.registerClKernel(sscal_cl_kernel_, "sscal_cl");
+
+    if (!kernel_ptr) {
       break;
     }
 
@@ -474,12 +476,12 @@ void sscal_cl(float *X, const unsigned int N, const float alpha,
       break;
     }
 
-    result = kernel_sscal.SetKernelArguments(0, &inputX, sizeof(cl_mem));
+    result = kernel_ptr->SetKernelArguments(0, &inputX, sizeof(cl_mem));
     if (!result) {
       break;
     }
 
-    result = kernel_sscal.SetKernelArguments(1, &alpha, sizeof(float));
+    result = kernel_ptr->SetKernelArguments(1, &alpha, sizeof(float));
     if (!result) {
       break;
     }
@@ -488,7 +490,7 @@ void sscal_cl(float *X, const unsigned int N, const float alpha,
     const int work_group_size[3] = {32, 32, 1}; // test-value
 
     result = context.command_queue_inst_.DispatchCommand(
-      kernel_sscal, work_groups_count, work_group_size);
+      kernel_ptr, work_groups_count, work_group_size);
     if (!result) {
       break;
     }
index 008345eef27d5c6add6f7498b01a2bcb492d2d25..d99fdb15b119f6a91dccd3ab1d6da2466d47ef9d 100644 (file)
@@ -14,6 +14,7 @@
 #ifndef __BLAS_KERNELS_H__
 #define __BLAS_KERNELS_H__
 
+#include <cl_context.h>
 #include <layer_context.h>
 #include <opencl_buffer.h>
 #include <opencl_kernel.h>