transformed_value = input[b * channel * height * width + c * height * width + h * width + span - half_];
}
value = value * cos_ptr[k] + transformed_value * sin_ptr[k];
- // printf("GPU Batch: %u, Height: %u, Channel: %u, Width: %u, K: %u, Span: %u, Value: %f, Transformed Value: %f, cos_ptr[k]: %f, sin_ptr[k]: %f\n", b, h, c, w, k, span, value, transformed_value, cos_ptr[k], sin_ptr[k]);
output[b * channel * height * width + c * height * width + h * width + span] = value;
}
}
sin.assign(seq_len, std::vector<float>(dim, 0));
for (unsigned int i = 0; i < seq_len; ++i) {
-#ifdef USE_NEON
- nntrainer::calc_trigonometric_vals_dup(half_, freqs.data(), cos[i].data(),
- sin[i].data(), i);
-#else
// Fill row i of the cos/sin lookup tables. Each angle is computed once per
// frequency and written to both halves of the row ([j] and [j + half_]), so
// the table can be indexed directly by the position inside a rotary block.
// Fix: the cosine value was only written to the lower half, leaving
// cos[i][j + half_] at its initial value while the sine was mirrored.
// NOTE(review): assumes cos[i] holds at least 2 * half_ entries, like sin[i]
// (sin rows are sized `dim` above) -- confirm against the cos allocation.
for (unsigned int j = 0; j < half_; ++j) {
  const float angle = i * freqs[j];
  const float cos_val = std::cos(angle);
  const float sin_val = std::sin(angle);

  cos[i][j] = cos_val;
  cos[i][j + half_] = cos_val; // repeated 2 times

  sin[i][j] = sin_val;
  sin[i][j + half_] = sin_val; // repeated 2 times
}
-#endif
}
freqs_cos = cos;
freqs_sin = sin;
if (from >= max_timestep) {
cos_ = std::vector<float>(dim);
sin_ = std::vector<float>(dim);
-#ifdef USE_NEON
- nntrainer::calc_trigonometric_vals_dup(half_, freqs.data(), cos_.data(),
- sin_.data(), from);
-#else
// Position `from` lies beyond the precomputed range, so its cos/sin values
// are computed here. cos_ and sin_ both hold `dim` entries; each angle is
// written to index i and mirrored at i + half_.
// Fix: cos_[i + half_] was never written, so the upper half of cos_ stayed
// zero-initialized while sin_ was mirrored, although the rotation reads
// cos_[k] and sin_[k] with the same index.
for (unsigned int i = 0; i < half_; ++i) {
  const float angle = from * freqs[i];
  const float cos_val = std::cos(angle);
  const float sin_val = std::sin(angle);

  cos_[i] = cos_val;
  cos_[i + half_] = cos_val; // repeated 2 times

  sin_[i] = sin_val;
  sin_[i + half_] = sin_val; // repeated 2 times
}
-#endif
} else {
cos_.resize(max_timestep);
sin_.resize(max_timestep);
transformed_value = in.getValue<float>(b, c, h, span - half_);
}
value = value * cos_[k] + transformed_value * sin_[k];
- // printf("CPU Batch: %u, Channel: %u, Height: %u, Width: %u, K:
- // %u, Span: %u, Value: %f, Transformed Value: %f, cos_ptr[k]:
- // %f, sin_ptr[k]: %f\n ", b, c, h, w, k, span, value,
- // transformed_value, cos_[k], sin_[k]);
out.setValue(b, c, h, span, value);
}
}
B_fp32.copy(A_fp32);
- // std::cout << "\nA_fp32 and B_fp32 before rotary embedding:" << std::endl;
- // for (unsigned int i = 0; i < A_fp32.size(); ++i) {
- // std::cout << "Element " << i << " -> " << *(A_fp32.getData<float>() + i)
- // <<"\t"<<*(B_fp32.getData<float>() + i)<< std::endl;
- // }
-
apply_rotary_emb_cl(A_fp32, dim, from, max_timestep, rc);
apply_rotary_emb_tensor(B_fp32, dim, from, max_timestep);