namespace caffe {
template <typename Dtype>
-__global__ void LRNFillScale(const int nthreads, const Dtype* in,
+__global__ void LRNFillScale(const int nthreads, const Dtype* const in,
const int num, const int channels, const int height,
const int width, const int size, const Dtype alpha_over_size,
- const Dtype k, Dtype* scale) {
+ const Dtype k, Dtype* const scale) {
CUDA_KERNEL_LOOP(index, nthreads) {
// find out the local offset
- int w = index % width;
- int h = (index / width) % height;
- int n = index / width / height;
- int offset = (n * channels * height + h) * width + w;
- int step = height * width;
+ const int w = index % width;
+ const int h = (index / width) % height;
+ const int n = index / width / height;
+ const int offset = (n * channels * height + h) * width + w;
+ const int step = height * width;
const Dtype* const in_off = in + offset;
Dtype* const scale_off = scale + offset;
int head = 0;
- int pre_pad = (size - 1) / 2;
- int post_pad = size - pre_pad - 1;
+ const int pre_pad = (size - 1) / 2;
+ const int post_pad = size - pre_pad - 1;
Dtype accum_scale = 0;
// fill the scale at [n, :, h, w]
// accumulate values
// TODO: check if it would be faster to just put it into the previous kernel.
template <typename Dtype>
// Computes the LRN forward output given a precomputed scale map:
//   out[i] = in[i] * scale[i]^(-beta)
// where `negative_beta` is -beta and `scale` holds k + alpha/size * sum(sq)
// as filled by LRNFillScale. `in`, `scale`, and `out` are flat device
// buffers of `nthreads` elements; CUDA_KERNEL_LOOP is a grid-stride loop,
// so any 1-D launch configuration covering nthreads is valid.
// NOTE: pow (not powf) is used so the kernel stays correct for
// Dtype == double; for Dtype == float this promotes through double.
__global__ void LRNComputeOutput(const int nthreads, const Dtype* const in,
    const Dtype* const scale, const Dtype negative_beta, Dtype* const out) {
  CUDA_KERNEL_LOOP(index, nthreads) {
    out[index] = in[index] * pow(scale[index], negative_beta);
  }
}
template <typename Dtype>
-__global__ void LRNComputeDiff(const int nthreads, const Dtype* bottom_data,
- const Dtype* top_data, const Dtype* scale, const Dtype* top_diff,
+__global__ void LRNComputeDiff(const int nthreads,
+ const Dtype* const bottom_data, const Dtype* const top_data,
+ const Dtype* const scale, const Dtype* const top_diff,
const int num, const int channels, const int height,
const int width, const int size, const Dtype negative_beta,
- const Dtype cache_ratio,
- Dtype* bottom_diff) {
+ const Dtype cache_ratio, Dtype* const bottom_diff) {
CUDA_KERNEL_LOOP(index, nthreads) {
// find out the local offset
- int w = index % width;
- int h = (index / width) % height;
- int n = index / width / height;
- int offset = (n * channels * height + h) * width + w;
- int step = height * width;
+ const int w = index % width;
+ const int h = (index / width) % height;
+ const int n = index / width / height;
+ const int offset = (n * channels * height + h) * width + w;
+ const int step = height * width;
const Dtype* const bottom_off = bottom_data + offset;
const Dtype* const top_off = top_data + offset;
const Dtype* const scale_off = scale + offset;
const Dtype* const top_diff_off = top_diff + offset;
Dtype* const bottom_diff_off = bottom_diff + offset;
int head = 0;
- int pre_pad = size - (size + 1) / 2;
- int post_pad = size - pre_pad - 1;
+ const int pre_pad = size - (size + 1) / 2;
+ const int post_pad = size - pre_pad - 1;
Dtype accum_ratio = 0;
// accumulate values
while (head < post_pad && head < channels) {