[enhance] Registering OpenCL kernels at cl_context

author Debadri Samaddar <s.debadri@samsung.com>

Wed, 11 Sep 2024 08:05:22 +0000 (13:35 +0530)

committer Jijoong Moon <jijoong.moon@samsung.com>

Sun, 22 Sep 2024 23:06:56 +0000 (08:06 +0900)
author Debadri Samaddar <s.debadri@samsung.com>
Wed, 11 Sep 2024 08:05:22 +0000 (13:35 +0530)
committer Jijoong Moon <jijoong.moon@samsung.com>
Sun, 22 Sep 2024 23:06:56 +0000 (08:06 +0900)
diff --git a/api/ccapi/include/layer.h b/api/ccapi/include/layer.h

index 3740500aa6ff7e18d817489309a4771016fa1b08..d9f9cffdd25d73375778bb03e5a14726d2531ecb 100644 (file)
--- a/api/ccapi/include/layer.h
+++ b/api/ccapi/include/layer.h
@@ -266,7 +266,8 @@ createLayer(const LayerType &type,
   */
  std::unique_ptr<Layer>
  createLayer(const std::string &type,
-            const std::vector<std::string> &properties = {});
+            const std::vector<std::string> &properties = {},
+            const LayerComputeEngine &compute_engine = LayerComputeEngine::CPU);
  
  /**
   * @brief General Layer Factory function to register Layer
diff --git a/api/ccapi/src/factory.cpp b/api/ccapi/src/factory.cpp

index e0e6fc24a8943f26e4de1db2b254d6bf0344f7aa..5f2b2dd2b9bc9c7f3c117b3ee2e7c132ccbcf05d 100644 (file)
--- a/api/ccapi/src/factory.cpp
+++ b/api/ccapi/src/factory.cpp
@@ -40,8 +40,9 @@ std::unique_ptr<Layer> createLayer(const LayerType &type,
   * @brief Factory creator with constructor for layer
   */
  std::unique_ptr<Layer> createLayer(const std::string &type,
-                                   const std::vector<std::string> &properties) {
-  return nntrainer::createLayerNode(type, properties);
+                                   const std::vector<std::string> &properties,
+                                   const LayerComputeEngine &compute_engine) {
+  return nntrainer::createLayerNode(type, properties, compute_engine);
  }
  
  std::unique_ptr<Optimizer>
diff --git a/nntrainer/cl_context.cpp b/nntrainer/cl_context.cpp

index a1288cbdc0946296067aed2d0754885d7aeb09cc..9e28e47d06567f1d08e5021cfffaf1eabc26386c 100644 (file)
--- a/nntrainer/cl_context.cpp
+++ b/nntrainer/cl_context.cpp
@@ -119,6 +119,77 @@ const int ClContext::registerFactory(const FactoryType<T> factory,
    return assigned_int_key;
  }
  
+const ClContext::SharedPtrClKernel &
+ClContext::registerClKernel(std::string kernel_string,
+                            std::string kernel_name) {
+  // check if created before
+  if (ocl_kernel_map.find(kernel_name) != ocl_kernel_map.end()) {
+    ml_logi("Kernel already registered and initialized: %s",
+            kernel_name.c_str());
+    return ocl_kernel_map[kernel_name];
+  }
+
+  // creating shared_ptr for kernel object
+  SharedPtrClKernel kernelPtr = std::make_shared<opencl::Kernel>();
+  if (!clCreateKernel(kernel_string, kernel_name, kernelPtr)) {
+    ml_loge("Failed to register kernel %s", kernel_name.c_str());
+    return nullptr;
+  }
+  // add to map
+  ocl_kernel_map.emplace(kernel_name, kernelPtr);
+  return ocl_kernel_map[kernel_name];
+}
+
+bool ClContext::clCreateKernel(std::string &kernel_string,
+                               std::string &kernel_name,
+                               const SharedPtrClKernel &kernel_ptr_) {
+
+  ml_logi("Kernel initializing: %s", kernel_name.c_str());
+
+  bool result = false;
+
+  do {
+    opencl::Program program;
+
+    // reading binary
+    std::ifstream fs(opencl::Program::DEFAULT_KERNEL_PATH + "/" + kernel_name +
+                       "_kernel.bin",
+                     std::ios::binary | std::ios::in);
+
+    if (fs.good()) {
+      fs.seekg(0, std::ios::end);
+      size_t binary_size = fs.tellg();
+      fs.seekg(0, std::ios::beg);
+
+      unsigned char chunk[binary_size];
+      fs.read((char *)chunk, binary_size);
+
+      result = program.CreateCLProgramWithBinary(
+        context_inst_.GetContext(), context_inst_.GetDeviceId(), binary_size,
+        chunk,
+        opencl::Program::DEFAULT_KERNEL_PATH + "/" + kernel_name +
+          "_kernel.bin",
+        "");
+    } else {
+      result =
+        program.CreateCLProgram(context_inst_.GetContext(),
+                                context_inst_.GetDeviceId(), kernel_string, "");
+    }
+
+    if (!result) {
+      break;
+    }
+
+    result = kernel_ptr_->CreateKernelFromProgram(program, kernel_name);
+    if (!result) {
+      break;
+    }
+
+  } while (false);
+
+  return result;
+}
+
  /**
   * @copydoc const int ClContext::registerFactory
   */
diff --git a/nntrainer/cl_context.h b/nntrainer/cl_context.h

index 31e8d7e7dd20deb7ddcf8d5f998cccf96948474b..6beee993fb57251ab4e57e35167687c0f5da375a 100644 (file)
--- a/nntrainer/cl_context.h
+++ b/nntrainer/cl_context.h
@@ -30,6 +30,9 @@
  #include <layer_devel.h>
  
  #include <opencl_command_queue_manager.h>
+#include <opencl_context_manager.h>
+#include <opencl_kernel.h>
+#include <opencl_program.h>
  
  #include <nntrainer_log.h>
  
@@ -49,6 +52,8 @@ public:
  
    template <typename T> using PtrType = std::unique_ptr<T>;
  
+  using SharedPtrClKernel = std::shared_ptr<opencl::Kernel>;
+
    template <typename T>
    using FactoryType = std::function<PtrType<T>(const PropsType &)>;
  
@@ -61,6 +66,9 @@ public:
    /** integer to string key */
    using IntIndexType = std::unordered_map<int, std::string>;
  
+  /** string to kernel pointer map*/
+  using OclKernelMap = std::unordered_map<std::string, SharedPtrClKernel>;
+
    /**
     * This type contains tuple of
     * 1) integer -> string index
@@ -71,6 +79,12 @@ public:
  
    template <typename... Ts> using FactoryMap = std::tuple<IndexType<Ts>...>;
  
+  // getting static instance of commandqueue and opencl context
+  opencl::CommandQueueManager &command_queue_inst_ =
+    opencl::CommandQueueManager::GetInstance();
+
+  opencl::ContextManager &context_inst_ = opencl::ContextManager::GetInstance();
+
    /**
     * @brief   Default constructor
     */
@@ -183,12 +197,23 @@ public:
      return entry->second(props);
    }
  
+  /**
+   * @brief register or return already present OpenCl kernel pointer
+   * @param kernel_string kernel implementation string
+   * @param kernel_name kernel name
+   * @return reference of std::shared_ptr<opencl::Kernel>
+   */
+  const SharedPtrClKernel &registerClKernel(std::string kernel_string,
+                                            std::string kernel_name);
+
    /**
     * @brief destructor to release opencl commandQueue
     */
    ~ClContext() {
      if (cl_initialized) {
        command_queue_inst_.ReleaseCommandQueue();
+      // getContext() is called by clCreateKernel
+      context_inst_.ReleaseContext();
      }
    };
  
@@ -200,6 +225,9 @@ private:
  
    template <typename Args, typename T> struct isSupportedHelper;
  
+  // map to store initialized opencl::Kernel
+  OclKernelMap ocl_kernel_map;
+
    /**
     * @brief supportHelper to check if given type is supported within cl context
     */
@@ -215,10 +243,6 @@ private:
    template <typename T>
    struct isSupported : isSupportedHelper<T, decltype(factory_map)> {};
  
-  // getting static instance of commandqueue
-  opencl::CommandQueueManager &command_queue_inst_ =
-    opencl::CommandQueueManager::GetInstance();
-
    /**
     * @brief Initialize opencl commandqueue and context
     * @return true if OpenCL context and command queue creation is successful,
@@ -235,6 +259,16 @@ private:
      cl_initialized = result;
      return cl_initialized;
    };
+
+  /**
+   * @brief create OpenCl kernel
+   * @param kernel_string reference of implementation string
+   * @param kernel_name reference of kernel_name
+   * @param kernel_ptr_ reference of shared_ptr of Kernel
+   * @return true if successful, false otherwise
+   */
+  bool clCreateKernel(std::string &kernel_string, std::string &kernel_name,
+                      const SharedPtrClKernel &kernel_ptr_);
  };
  
  /**
diff --git a/nntrainer/opencl/opencl_command_queue_manager.cpp b/nntrainer/opencl/opencl_command_queue_manager.cpp

index c55ef0c0088bb903e7240a402f784ad4d980e6fd..109d83e78a0895b460d2691b8602edbf55dc0112 100644 (file)
--- a/nntrainer/opencl/opencl_command_queue_manager.cpp
+++ b/nntrainer/opencl/opencl_command_queue_manager.cpp
@@ -202,4 +202,34 @@ bool CommandQueueManager::DispatchCommand(Kernel kernel,
    return true;
  }
  
+bool CommandQueueManager::DispatchCommand(
+  const std::shared_ptr<Kernel> &kernel_ptr, const int (&work_groups_count)[3],
+  const int (&work_group_size)[3], cl_event *event) {
+
+  // work_dim of 2 has been hardcoded, might be modified later based on
+  // requirements
+
+  // setting the local_work_size referred to as the size of the
+  // work-group
+  const size_t local[2] = {static_cast<size_t>(work_group_size[0]),
+                           static_cast<size_t>(work_group_size[1])};
+
+  // setting the global_work_size that describe the number of global work-items
+  const size_t global[2] = {static_cast<size_t>(work_groups_count[0]),
+                            static_cast<size_t>(work_groups_count[1])};
+
+  cl_kernel kernel_ = kernel_ptr->GetKernel();
+
+  // returns NULL with error code if fails
+  const int error_code = clEnqueueNDRangeKernel(
+    command_queue_, kernel_, 2, nullptr, global, local, 0, nullptr, event);
+  if (error_code != CL_SUCCESS) {
+    ml_loge("Failed to clEnqueueNDRangeKernel. OpenCL error code: %d",
+            error_code);
+    return false;
+  }
+
+  return true;
+}
+
  } // namespace nntrainer::opencl
diff --git a/nntrainer/opencl/opencl_command_queue_manager.h b/nntrainer/opencl/opencl_command_queue_manager.h

index 5da2a570db06639767eea89733df48bb0b18aaf1..1e98f202f55300bf494367ee90c0b95d3fabaa57 100644 (file)
--- a/nntrainer/opencl/opencl_command_queue_manager.h
+++ b/nntrainer/opencl/opencl_command_queue_manager.h
@@ -16,6 +16,7 @@
  
  #include "opencl_kernel.h"
  #include "third_party/cl.h"
+#include <memory>
  
  namespace nntrainer::opencl {
  
@@ -99,6 +100,22 @@ public:
                         const int (&work_group_size)[3],
                         cl_event *event = nullptr);
  
+  /**
+   * @brief Overloaded function to initiate execution of the command queue.
+   *
+   * @param kernel_ptr reference of OpenCL kernel shared_ptr
+   * @param work_groups_count Total number of work items that will execute the
+   * kernel function
+   * @param work_group_size Number of work items that make up a work group
+   * @param event Object that identifies this command and can be used to query
+   * or wait for this command to complete
+   * @return true if command queue execution is successful or false otherwise
+   */
+  bool DispatchCommand(const std::shared_ptr<Kernel> &kernel_ptr,
+                       const int (&work_groups_count)[3],
+                       const int (&work_group_size)[3],
+                       cl_event *event = nullptr);
+
    /**
     * @brief Get the OpenCL Command Queue object
     *
diff --git a/nntrainer/tensor/cl_operations/blas_kernels.cpp b/nntrainer/tensor/cl_operations/blas_kernels.cpp

index 5c0d1dfa72abcb2a99fe751da03fa7272d54d167..c7df348d0485d40cff59ed845f65a16d371c61b3 100644 (file)
--- a/nntrainer/tensor/cl_operations/blas_kernels.cpp
+++ b/nntrainer/tensor/cl_operations/blas_kernels.cpp
@@ -459,9 +459,11 @@ void sscal_cl(float *X, const unsigned int N, const float alpha,
    bool result = false;
  
    do {
-    result = context.clCreateKernel(sscal_cl_kernel_,
-                                    context.LayerKernel::SSCAL, kernel_sscal);
-    if (!result) {
+    auto &cc = ClContext::Global();
+    ClContext::SharedPtrClKernel kernel_ptr =
+      cc.registerClKernel(sscal_cl_kernel_, "sscal_cl");
+
+    if (!kernel_ptr) {
        break;
      }
  
@@ -474,12 +476,12 @@ void sscal_cl(float *X, const unsigned int N, const float alpha,
        break;
      }
  
-    result = kernel_sscal.SetKernelArguments(0, &inputX, sizeof(cl_mem));
+    result = kernel_ptr->SetKernelArguments(0, &inputX, sizeof(cl_mem));
      if (!result) {
        break;
      }
  
-    result = kernel_sscal.SetKernelArguments(1, &alpha, sizeof(float));
+    result = kernel_ptr->SetKernelArguments(1, &alpha, sizeof(float));
      if (!result) {
        break;
      }
@@ -488,7 +490,7 @@ void sscal_cl(float *X, const unsigned int N, const float alpha,
      const int work_group_size[3] = {32, 32, 1}; // test-value
  
      result = context.command_queue_inst_.DispatchCommand(
-      kernel_sscal, work_groups_count, work_group_size);
+      kernel_ptr, work_groups_count, work_group_size);
      if (!result) {
        break;
      }
diff --git a/nntrainer/tensor/cl_operations/blas_kernels.h b/nntrainer/tensor/cl_operations/blas_kernels.h

index 008345eef27d5c6add6f7498b01a2bcb492d2d25..d99fdb15b119f6a91dccd3ab1d6da2466d47ef9d 100644 (file)
--- a/nntrainer/tensor/cl_operations/blas_kernels.h
+++ b/nntrainer/tensor/cl_operations/blas_kernels.h
@@ -14,6 +14,7 @@
  #ifndef __BLAS_KERNELS_H__
  #define __BLAS_KERNELS_H__
  
+#include <cl_context.h>
  #include <layer_context.h>
  #include <opencl_buffer.h>
  #include <opencl_kernel.h>
author	Debadri Samaddar <s.debadri@samsung.com>
	Wed, 11 Sep 2024 08:05:22 +0000 (13:35 +0530)
committer	Jijoong Moon <jijoong.moon@samsung.com>
	Sun, 22 Sep 2024 23:06:56 +0000 (08:06 +0900)
api/ccapi/include/layer.h		patch \| blob \| history
api/ccapi/src/factory.cpp		patch \| blob \| history
nntrainer/cl_context.cpp		patch \| blob \| history
nntrainer/cl_context.h		patch \| blob \| history
nntrainer/opencl/opencl_command_queue_manager.cpp		patch \| blob \| history
nntrainer/opencl/opencl_command_queue_manager.h		patch \| blob \| history
nntrainer/tensor/cl_operations/blas_kernels.cpp		patch \| blob \| history
nntrainer/tensor/cl_operations/blas_kernels.h		patch \| blob \| history