Kernel Optimized for GPU. Some trivial changes in code.
Signed-off-by: Yash Singh <yash.singh@samsung.com>
* @param[out] freqs base frequencies array to be used in the future computation
* @param[in] theta rotary angle
*/
-void precompute_freqs(int dim, unsigned int seq_len,
+void precompute_freqs(unsigned int dim, unsigned int seq_len,
std::vector<std::vector<float>> &freqs_cos,
std::vector<std::vector<float>> &freqs_sin,
std::vector<float> &freqs, float theta = 10000.0) {
freqs.push_back(1.0 / (std::pow(theta, (2 * i) / static_cast<float>(dim))));
}
- auto cos = std::vector<std::vector<float>>();
- cos.assign(seq_len, std::vector<float>(dim, 0));
+ auto cos_vec = std::vector<std::vector<float>>();
+ cos_vec.assign(seq_len, std::vector<float>(dim, 0));
- auto sin = std::vector<std::vector<float>>();
- sin.assign(seq_len, std::vector<float>(dim, 0));
+ auto sin_vec = std::vector<std::vector<float>>();
+ sin_vec.assign(seq_len, std::vector<float>(dim, 0));
for (unsigned int i = 0; i < seq_len; ++i) {
for (unsigned int j = 0; j < half_; ++j) {
float angle = i * freqs[j];
- cos[i][j] = std::cos(angle);
- cos[i][j + half_] = std::cos(angle); // repeated 2 times
+ cos_vec[i][j] = std::cos(angle);
+ cos_vec[i][j + half_] = std::cos(angle); // repeated 2 times
- sin[i][j] = std::sin(angle);
- sin[i][j + half_] = std::sin(angle); // repeated 2 times
+ sin_vec[i][j] = std::sin(angle);
+ sin_vec[i][j + half_] = std::sin(angle); // repeated 2 times
}
}
- freqs_cos = cos;
- freqs_sin = sin;
+ freqs_cos = cos_vec;
+ freqs_sin = sin_vec;
}
/**
* @param[in] dim hidden dim size
* @param[in] from sequence order
* @param[in] max_timestep maximum timestep
+ * @param[in] context layer context to get the resource manager and queue id
+ *
+ * @todo Calling precompute_freqs in finalize to reduce code redundancy.
*/
void apply_rotary_emb_cl(Tensor &in, unsigned int dim, unsigned int from,
unsigned int max_timestep, RunLayerContext &context) {
nntrainer::Tensor out(in.getDim());
- float value = 0;
- float transformed_value = 0.0;
+ float value = 0.0f;
+ float transformed_value = 0.0f;
unsigned int half_ = dim / 2;
std::vector<std::vector<float>> freqs_cos = {};
unsigned int half_,
unsigned int max_timestep,
unsigned int from) {
- unsigned int gid = get_global_id(0);
- unsigned int gws = get_global_size(0);
-
__global float *cos_ptr = cos_;
__global float *sin_ptr = sin_;
float value = 0.0f;
float transformed_value = 0.0f;
- for (unsigned int b = 0; b < batch; b++) {
- for (unsigned int c = 0; c < channel; c++) {
- for (unsigned int h = 0; h < height; h++) {
- if (from + h < max_timestep) {
- unsigned idx = (from + h)*dim;
- for(unsigned int i = idx; i < idx + dim; i++){
- cos_ptr[i - idx] = freqs_cos[i];
- sin_ptr[i - idx] = freqs_sin[i];
- }
+ unsigned int b = get_global_id(0);
+ unsigned int c = get_global_id(1);
+
+ if(b < batch && c < channel){
+ for (unsigned int h = 0; h < height; h++) {
+ if (from + h < max_timestep) {
+ unsigned idx = (from + h)*dim;
+ for(unsigned int i = idx; i < idx + dim; i++){
+ cos_ptr[i - idx] = freqs_cos[i];
+ sin_ptr[i - idx] = freqs_sin[i];
}
- for (unsigned int w = 0; w < width; w = w + dim) {
- for (unsigned int k = 0; k < dim; k++) {
- unsigned int span = w + k;
- value = input[b * channel * height * width + c * height * width + h * width + span];
- if (k < half_) {
- transformed_value = -1.0f * input[b * channel * height * width + c * height * width + h * width + span + half_];
- } else {
- transformed_value = input[b * channel * height * width + c * height * width + h * width + span - half_];
- }
- value = value * cos_ptr[k] + transformed_value * sin_ptr[k];
- output[b * channel * height * width + c * height * width + h * width + span] = value;
+ }
+
+ for (unsigned int w = 0; w < width; w = w + dim) {
+ for (unsigned int k = 0; k < dim; k++) {
+ unsigned int span = w + k;
+ value = input[b * channel * height * width + c * height * width + h * width + span];
+ if (k < half_) {
+ transformed_value = -1.0f * input[b * channel * height * width + c * height * width + h * width + span + half_];
+ } else {
+ transformed_value = input[b * channel * height * width + c * height * width + h * width + span - half_];
}
+ value = value * cos_ptr[k] + transformed_value * sin_ptr[k];
+ output[b * channel * height * width + c * height * width + h * width + span] = value;
}
}
}
break;
}
- const int work_groups_count[3] = {1, 1, 1};
- const int work_group_size[3] = {32, 1, 1}; // test-value
+ const int work_groups_count[3] = {(int)batch, (int)channel, 1};
+ const int work_group_size[3] = {32, 32, 1}; // test-value
result = context.command_queue_inst_.DispatchCommand(
kernel_rotary_emb, work_groups_count, work_group_size);
if (!result) {
unsigned int half_,
unsigned int max_timestep,
unsigned int from) {
- unsigned int gid = get_global_id(0);
- unsigned int gws = get_global_size(0);
-
__global float *cos_ptr = cos_;
__global float *sin_ptr = sin_;
float value = 0.0f;
float transformed_value = 0.0f;
- for (unsigned int b = 0; b < batch; b++) {
- for (unsigned int c = 0; c < channel; c++) {
- for (unsigned int h = 0; h < height; h++) {
- if (from + h < max_timestep) {
- unsigned idx = (from + h)*dim;
- for(int i = idx; i < idx + dim; i++ ){
- cos_ptr[i - idx] = freqs_cos[i];
- sin_ptr[i - idx] = freqs_sin[i];
- }
+ unsigned int b = get_global_id(0);
+ unsigned int c = get_global_id(1);
+
+ if(b < batch && c < channel){
+ for (unsigned int h = 0; h < height; h++) {
+ if (from + h < max_timestep) {
+ unsigned idx = (from + h)*dim;
+ for(int i = idx; i < idx + dim; i++ ){
+ cos_ptr[i - idx] = freqs_cos[i];
+ sin_ptr[i - idx] = freqs_sin[i];
}
+ }
- for (unsigned int w = 0; w < width; w = w + dim) {
- for (unsigned int k = 0; k < dim; k++) {
- unsigned int span = w + k;
- value = (float)input[b * channel * height * width + c * height * width + h * width + span];
- if (k < half_) {
- transformed_value = -1.0f * (float)input[b * channel * height * width + c * height * width + h * width + span + half_];
- } else {
- transformed_value = (float)input[b * channel * height * width + c * height * width + h * width + span - half_];
- }
- value = value * cos_ptr[k] + transformed_value * sin_ptr[k];
- output[b * channel * height * width + c * height * width + h * width + span] = (half)value;
+ for (unsigned int w = 0; w < width; w = w + dim) {
+ for (unsigned int k = 0; k < dim; k++) {
+ unsigned int span = w + k;
+ value = (float)input[b * channel * height * width + c * height * width + h * width + span];
+ if (k < half_) {
+ transformed_value = -1.0f * (float)input[b * channel * height * width + c * height * width + h * width + span + half_];
+ } else {
+ transformed_value = (float)input[b * channel * height * width + c * height * width + h * width + span - half_];
}
+ value = value * cos_ptr[k] + transformed_value * sin_ptr[k];
+ output[b * channel * height * width + c * height * width + h * width + span] = (half)value;
}
}
}
break;
}
- const int work_groups_count[3] = {1, 1, 1};
- const int work_group_size[3] = {32, 1, 1}; // test-value
+ const int work_groups_count[3] = {(int)batch, (int)channel, 1};
+ const int work_group_size[3] = {32, 32, 1}; // test-value
result = context.command_queue_inst_.DispatchCommand(
kernel_rotary_emb_fp16, work_groups_count, work_group_size);
if (!result) {
#include <string>
/**
- * @brief compute frequency for rotary embedding
+ * @brief Testing code for CPU results and compute frequency for rotary
+ * embedding
* @param[in] dim hidden dim size
* @param[in] seq_len sequency length
* @param[out] freqs_cos cosine of the frequencies
* sin values for each position in sequence
* @param[in] theta rotary angle
*/
-void precompute_freqs(int dim, unsigned int seq_len,
+void precompute_freqs(unsigned int dim, unsigned int seq_len,
std::vector<std::vector<float>> &freqs_cos,
std::vector<std::vector<float>> &freqs_sin,
std::vector<float> &freqs, float theta = 10000.0) {
(std::pow(theta, (2 * i) / static_cast<float>(dim))));
}
- auto cos = std::vector<std::vector<float>>();
- cos.assign(seq_len, std::vector<float>(dim, 0));
+ auto cos_vec = std::vector<std::vector<float>>();
+ cos_vec.assign(seq_len, std::vector<float>(dim, 0));
- auto sin = std::vector<std::vector<float>>();
- sin.assign(seq_len, std::vector<float>(dim, 0));
+ auto sin_vec = std::vector<std::vector<float>>();
+ sin_vec.assign(seq_len, std::vector<float>(dim, 0));
for (unsigned int i = 0; i < seq_len; ++i) {
for (unsigned int j = 0; j < half_; ++j) {
float angle = i * freqs[j];
- cos[i][j] = std::cos(angle);
- cos[i][j + half_] = std::cos(angle); // repeated 2 times
+ cos_vec[i][j] = std::cos(angle);
+ cos_vec[i][j + half_] = std::cos(angle); // repeated 2 times
- sin[i][j] = std::sin(angle);
- sin[i][j + half_] = std::sin(angle); // repeated 2 times
+ sin_vec[i][j] = std::sin(angle);
+ sin_vec[i][j + half_] = std::sin(angle); // repeated 2 times
}
}
- freqs_cos = cos;
- freqs_sin = sin;
+ freqs_cos = cos_vec;
+ freqs_sin = sin_vec;
}
}
/**
- * @brief apply rotary embedding
+ * @brief Testing code for CPU results and apply rotary embedding
* @param[in] in input tensor
* @param[in] dim hidden dim size
* @param[in] from sequence order
void apply_rotary_emb_tensor(nntrainer::Tensor &in, unsigned int dim,
unsigned int from, unsigned int max_timestep) {
nntrainer::Tensor out(in.getDim());
- float value = 0;
- float transformed_value = 0.0;
+ float value = 0.0f;
+ float transformed_value = 0.0f;
unsigned int half_ = dim / 2;
std::vector<std::vector<float>> freqs_cos = {};