lib/jxl/convolve_symmetric5.cc

   1 // Copyright (c) the JPEG XL Project Authors. All rights reserved.
   2 //
   3 // Use of this source code is governed by a BSD-style
   4 // license that can be found in the LICENSE file.
   5
   6 #include "lib/jxl/convolve.h"
   7
   8 #undef HWY_TARGET_INCLUDE
   9 #define HWY_TARGET_INCLUDE "lib/jxl/convolve_symmetric5.cc"
  10 #include <hwy/foreach_target.h>
  11 #include <hwy/highway.h>
  12
  13 #include "lib/jxl/base/common.h"
  14 #include "lib/jxl/convolve-inl.h"
  15
  16 HWY_BEFORE_NAMESPACE();
  17 namespace jxl {
  18 namespace HWY_NAMESPACE {
  19
  20 // These templates are not found via ADL.
  21 using hwy::HWY_NAMESPACE::Add;
  22 using hwy::HWY_NAMESPACE::Mul;
  23 using hwy::HWY_NAMESPACE::Vec;
  24
  25 // Weighted sum of 1x5 pixels around ix, iy with [wx2 wx1 wx0 wx1 wx2].
  26 template <class WrapY>
  27 static float WeightedSumBorder(const ImageF& in, const WrapY wrap_y,
  28                                const int64_t ix, const int64_t iy,
  29                                const size_t xsize, const size_t ysize,
  30                                const float wx0, const float wx1,
  31                                const float wx2) {
  32   const WrapMirror wrap_x;
  33   const float* JXL_RESTRICT row = in.ConstRow(wrap_y(iy, ysize));
  34   const float in_m2 = row[wrap_x(ix - 2, xsize)];
  35   const float in_p2 = row[wrap_x(ix + 2, xsize)];
  36   const float in_m1 = row[wrap_x(ix - 1, xsize)];
  37   const float in_p1 = row[wrap_x(ix + 1, xsize)];
  38   const float in_00 = row[ix];
  39   const float sum_2 = wx2 * (in_m2 + in_p2);
  40   const float sum_1 = wx1 * (in_m1 + in_p1);
  41   const float sum_0 = wx0 * in_00;
  42   return sum_2 + sum_1 + sum_0;
  43 }
  44
  45 template <class WrapY, class V>
  46 static V WeightedSum(const ImageF& in, const WrapY wrap_y, const size_t ix,
  47                      const int64_t iy, const size_t ysize, const V wx0,
  48                      const V wx1, const V wx2) {
  49   const HWY_FULL(float) d;
  50   const float* JXL_RESTRICT center = in.ConstRow(wrap_y(iy, ysize)) + ix;
  51   const auto in_m2 = LoadU(d, center - 2);
  52   const auto in_p2 = LoadU(d, center + 2);
  53   const auto in_m1 = LoadU(d, center - 1);
  54   const auto in_p1 = LoadU(d, center + 1);
  55   const auto in_00 = Load(d, center);
  56   const auto sum_2 = Mul(wx2, Add(in_m2, in_p2));
  57   const auto sum_1 = Mul(wx1, Add(in_m1, in_p1));
  58   const auto sum_0 = Mul(wx0, in_00);
  59   return Add(sum_2, Add(sum_1, sum_0));
  60 }
  61
  62 // Produces result for one pixel
  63 template <class WrapY>
  64 float Symmetric5Border(const ImageF& in, const Rect& rect, const int64_t ix,
  65                        const int64_t iy, const WeightsSymmetric5& weights) {
  66   const float w0 = weights.c[0];
  67   const float w1 = weights.r[0];
  68   const float w2 = weights.R[0];
  69   const float w4 = weights.d[0];
  70   const float w5 = weights.L[0];
  71   const float w8 = weights.D[0];
  72
  73   const size_t xsize = rect.xsize();
  74   const size_t ysize = rect.ysize();
  75   const WrapY wrap_y;
  76   // Unrolled loop over all 5 rows of the kernel.
  77   float sum0 = WeightedSumBorder(in, wrap_y, ix, iy, xsize, ysize, w0, w1, w2);
  78
  79   sum0 += WeightedSumBorder(in, wrap_y, ix, iy - 2, xsize, ysize, w2, w5, w8);
  80   float sum1 =
  81       WeightedSumBorder(in, wrap_y, ix, iy + 2, xsize, ysize, w2, w5, w8);
  82
  83   sum0 += WeightedSumBorder(in, wrap_y, ix, iy - 1, xsize, ysize, w1, w4, w5);
  84   sum1 += WeightedSumBorder(in, wrap_y, ix, iy + 1, xsize, ysize, w1, w4, w5);
  85
  86   return sum0 + sum1;
  87 }
  88
  89 // Produces result for one vector's worth of pixels
  90 template <class WrapY>
  91 static void Symmetric5Interior(const ImageF& in, const Rect& rect,
  92                                const int64_t ix, const int64_t iy,
  93                                const WeightsSymmetric5& weights,
  94                                float* JXL_RESTRICT row_out) {
  95   const HWY_FULL(float) d;
  96
  97   const auto w0 = LoadDup128(d, weights.c);
  98   const auto w1 = LoadDup128(d, weights.r);
  99   const auto w2 = LoadDup128(d, weights.R);
 100   const auto w4 = LoadDup128(d, weights.d);
 101   const auto w5 = LoadDup128(d, weights.L);
 102   const auto w8 = LoadDup128(d, weights.D);
 103
 104   const size_t ysize = rect.ysize();
 105   const WrapY wrap_y;
 106   // Unrolled loop over all 5 rows of the kernel.
 107   auto sum0 = WeightedSum(in, wrap_y, ix, iy, ysize, w0, w1, w2);
 108
 109   sum0 = Add(sum0, WeightedSum(in, wrap_y, ix, iy - 2, ysize, w2, w5, w8));
 110   auto sum1 = WeightedSum(in, wrap_y, ix, iy + 2, ysize, w2, w5, w8);
 111
 112   sum0 = Add(sum0, WeightedSum(in, wrap_y, ix, iy - 1, ysize, w1, w4, w5));
 113   sum1 = Add(sum1, WeightedSum(in, wrap_y, ix, iy + 1, ysize, w1, w4, w5));
 114
 115   Store(Add(sum0, sum1), d, row_out + ix);
 116 }
 117
 118 template <class WrapY>
 119 static void Symmetric5Row(const ImageF& in, const Rect& rect, const int64_t iy,
 120                           const WeightsSymmetric5& weights,
 121                           float* JXL_RESTRICT row_out) {
 122   const int64_t kRadius = 2;
 123   const size_t xsize = rect.xsize();
 124
 125   size_t ix = 0;
 126   const HWY_FULL(float) d;
 127   const size_t N = Lanes(d);
 128   const size_t aligned_x = RoundUpTo(kRadius, N);
 129   for (; ix < std::min(aligned_x, xsize); ++ix) {
 130     row_out[ix] = Symmetric5Border<WrapY>(in, rect, ix, iy, weights);
 131   }
 132   for (; ix + N + kRadius <= xsize; ix += N) {
 133     Symmetric5Interior<WrapY>(in, rect, ix, iy, weights, row_out);
 134   }
 135   for (; ix < xsize; ++ix) {
 136     row_out[ix] = Symmetric5Border<WrapY>(in, rect, ix, iy, weights);
 137   }
 138 }
 139
 140 static JXL_NOINLINE void Symmetric5BorderRow(const ImageF& in, const Rect& rect,
 141                                              const int64_t iy,
 142                                              const WeightsSymmetric5& weights,
 143                                              float* JXL_RESTRICT row_out) {
 144   return Symmetric5Row<WrapMirror>(in, rect, iy, weights, row_out);
 145 }
 146
 147 // Semi-vectorized (interior pixels Fonly); called directly like slow::, unlike
 148 // the fully vectorized strategies below.
 149 void Symmetric5(const ImageF& in, const Rect& rect,
 150                 const WeightsSymmetric5& weights, ThreadPool* pool,
 151                 ImageF* JXL_RESTRICT out) {
 152   const size_t ysize = rect.ysize();
 153   JXL_CHECK(RunOnPool(
 154       pool, 0, static_cast<uint32_t>(ysize), ThreadPool::NoInit,
 155       [&](const uint32_t task, size_t /*thread*/) {
 156         const int64_t iy = task;
 157
 158         if (iy < 2 || iy >= static_cast<ssize_t>(ysize) - 2) {
 159           Symmetric5BorderRow(in, rect, iy, weights, out->Row(iy));
 160         } else {
 161           Symmetric5Row<WrapUnchanged>(in, rect, iy, weights, out->Row(iy));
 162         }
 163       },
 164       "Symmetric5x5Convolution"));
 165 }
 166
 167 // NOLINTNEXTLINE(google-readability-namespace-comments)
 168 }  // namespace HWY_NAMESPACE
 169 }  // namespace jxl
 170 HWY_AFTER_NAMESPACE();
 171
 172 #if HWY_ONCE
 173 namespace jxl {
 174
 175 HWY_EXPORT(Symmetric5);
 176 void Symmetric5(const ImageF& in, const Rect& rect,
 177                 const WeightsSymmetric5& weights, ThreadPool* pool,
 178                 ImageF* JXL_RESTRICT out) {
 179   return HWY_DYNAMIC_DISPATCH(Symmetric5)(in, rect, weights, pool, out);
 180 }
 181
 182 }  // namespace jxl
 183 #endif  // HWY_ONCE