lib/jpegli/render.cc

   1 // Copyright (c) the JPEG XL Project Authors. All rights reserved.
   2 //
   3 // Use of this source code is governed by a BSD-style
   4 // license that can be found in the LICENSE file.
   5
   6 #include "lib/jpegli/render.h"
   7
   8 #include <string.h>
   9
  10 #include <array>
  11 #include <atomic>
  12 #include <cmath>
  13 #include <cstddef>
  14 #include <cstdint>
  15 #include <hwy/aligned_allocator.h>
  16 #include <vector>
  17
  18 #include "lib/jpegli/color_quantize.h"
  19 #include "lib/jpegli/color_transform.h"
  20 #include "lib/jpegli/decode_internal.h"
  21 #include "lib/jpegli/error.h"
  22 #include "lib/jpegli/idct.h"
  23 #include "lib/jpegli/upsample.h"
  24 #include "lib/jxl/base/byte_order.h"
  25 #include "lib/jxl/base/compiler_specific.h"
  26 #include "lib/jxl/base/status.h"
  27
  28 #ifdef MEMORY_SANITIZER
  29 #define JXL_MEMORY_SANITIZER 1
  30 #elif defined(__has_feature)
  31 #if __has_feature(memory_sanitizer)
  32 #define JXL_MEMORY_SANITIZER 1
  33 #else
  34 #define JXL_MEMORY_SANITIZER 0
  35 #endif
  36 #else
  37 #define JXL_MEMORY_SANITIZER 0
  38 #endif
  39
  40 #if JXL_MEMORY_SANITIZER
  41 #include "sanitizer/msan_interface.h"
  42 #endif
  43
  44 #undef HWY_TARGET_INCLUDE
  45 #define HWY_TARGET_INCLUDE "lib/jpegli/render.cc"
  46 #include <hwy/foreach_target.h>
  47 #include <hwy/highway.h>
  48
  49 HWY_BEFORE_NAMESPACE();
  50 namespace jpegli {
  51 namespace HWY_NAMESPACE {
  52
  53 // These templates are not found via ADL.
  54 using hwy::HWY_NAMESPACE::Abs;
  55 using hwy::HWY_NAMESPACE::Add;
  56 using hwy::HWY_NAMESPACE::Clamp;
  57 using hwy::HWY_NAMESPACE::Gt;
  58 using hwy::HWY_NAMESPACE::IfThenElseZero;
  59 using hwy::HWY_NAMESPACE::Mul;
  60 using hwy::HWY_NAMESPACE::NearestInt;
  61 using hwy::HWY_NAMESPACE::Or;
  62 using hwy::HWY_NAMESPACE::Rebind;
  63 using hwy::HWY_NAMESPACE::ShiftLeftSame;
  64 using hwy::HWY_NAMESPACE::ShiftRightSame;
  65 using hwy::HWY_NAMESPACE::Vec;
  66 using D = HWY_FULL(float);
  67 using DI = HWY_FULL(int32_t);
  68 constexpr D d;
  69 constexpr DI di;
  70
  71 void GatherBlockStats(const int16_t* JXL_RESTRICT coeffs,
  72                       const size_t coeffs_size, int32_t* JXL_RESTRICT nonzeros,
  73                       int32_t* JXL_RESTRICT sumabs) {
  74   for (size_t i = 0; i < coeffs_size; i += Lanes(d)) {
  75     size_t k = i % DCTSIZE2;
  76     const Rebind<int16_t, DI> di16;
  77     const Vec<DI> coeff = PromoteTo(di, Load(di16, coeffs + i));
  78     const auto abs_coeff = Abs(coeff);
  79     const auto not_0 = Gt(abs_coeff, Zero(di));
  80     const auto nzero = IfThenElseZero(not_0, Set(di, 1));
  81     Store(Add(nzero, Load(di, nonzeros + k)), di, nonzeros + k);
  82     Store(Add(abs_coeff, Load(di, sumabs + k)), di, sumabs + k);
  83   }
  84 }
  85
  86 void DecenterRow(float* row, size_t xsize) {
  87   const HWY_CAPPED(float, 8) df;
  88   const auto c128 = Set(df, 128.0f / 255);
  89   for (size_t x = 0; x < xsize; x += Lanes(df)) {
  90     Store(Add(Load(df, row + x), c128), df, row + x);
  91   }
  92 }
  93
  94 void DitherRow(j_decompress_ptr cinfo, float* row, int c, size_t y,
  95                size_t xsize) {
  96   jpeg_decomp_master* m = cinfo->master;
  97   if (!m->dither_[c]) return;
  98   const float* dither_row =
  99       &m->dither_[c][(y & m->dither_mask_) * m->dither_size_];
 100   for (size_t x = 0; x < xsize; ++x) {
 101     row[x] += dither_row[x & m->dither_mask_];
 102   }
 103 }
 104
 105 template <typename T>
 106 void StoreUnsignedRow(float* JXL_RESTRICT input[], size_t x0, size_t len,
 107                       size_t num_channels, float multiplier, T* output) {
 108   const HWY_CAPPED(float, 8) d;
 109   auto zero = Zero(d);
 110   auto mul = Set(d, multiplier);
 111   const Rebind<T, decltype(d)> du;
 112 #if JXL_MEMORY_SANITIZER
 113   const size_t padding = hwy::RoundUpTo(len, Lanes(d)) - len;
 114   for (size_t c = 0; c < num_channels; ++c) {
 115     __msan_unpoison(input[c] + x0 + len, sizeof(input[c][0]) * padding);
 116   }
 117 #endif
 118   if (num_channels == 1) {
 119     for (size_t i = 0; i < len; i += Lanes(d)) {
 120       auto v0 = Clamp(zero, Mul(LoadU(d, &input[0][x0 + i]), mul), mul);
 121       StoreU(DemoteTo(du, NearestInt(v0)), du, &output[i]);
 122     }
 123   } else if (num_channels == 2) {
 124     for (size_t i = 0; i < len; i += Lanes(d)) {
 125       auto v0 = Clamp(zero, Mul(LoadU(d, &input[0][x0 + i]), mul), mul);
 126       auto v1 = Clamp(zero, Mul(LoadU(d, &input[1][x0 + i]), mul), mul);
 127       StoreInterleaved2(DemoteTo(du, NearestInt(v0)),
 128                         DemoteTo(du, NearestInt(v1)), du, &output[2 * i]);
 129     }
 130   } else if (num_channels == 3) {
 131     for (size_t i = 0; i < len; i += Lanes(d)) {
 132       auto v0 = Clamp(zero, Mul(LoadU(d, &input[0][x0 + i]), mul), mul);
 133       auto v1 = Clamp(zero, Mul(LoadU(d, &input[1][x0 + i]), mul), mul);
 134       auto v2 = Clamp(zero, Mul(LoadU(d, &input[2][x0 + i]), mul), mul);
 135       StoreInterleaved3(DemoteTo(du, NearestInt(v0)),
 136                         DemoteTo(du, NearestInt(v1)),
 137                         DemoteTo(du, NearestInt(v2)), du, &output[3 * i]);
 138     }
 139   } else if (num_channels == 4) {
 140     for (size_t i = 0; i < len; i += Lanes(d)) {
 141       auto v0 = Clamp(zero, Mul(LoadU(d, &input[0][x0 + i]), mul), mul);
 142       auto v1 = Clamp(zero, Mul(LoadU(d, &input[1][x0 + i]), mul), mul);
 143       auto v2 = Clamp(zero, Mul(LoadU(d, &input[2][x0 + i]), mul), mul);
 144       auto v3 = Clamp(zero, Mul(LoadU(d, &input[3][x0 + i]), mul), mul);
 145       StoreInterleaved4(DemoteTo(du, NearestInt(v0)),
 146                         DemoteTo(du, NearestInt(v1)),
 147                         DemoteTo(du, NearestInt(v2)),
 148                         DemoteTo(du, NearestInt(v3)), du, &output[4 * i]);
 149     }
 150   }
 151 #if JXL_MEMORY_SANITIZER
 152   __msan_poison(output + num_channels * len,
 153                 sizeof(output[0]) * num_channels * padding);
 154 #endif
 155 }
 156
 157 void StoreFloatRow(float* JXL_RESTRICT input[3], size_t x0, size_t len,
 158                    size_t num_channels, float* output) {
 159   const HWY_CAPPED(float, 8) d;
 160   if (num_channels == 1) {
 161     memcpy(output, input[0] + x0, len * sizeof(output[0]));
 162   } else if (num_channels == 2) {
 163     for (size_t i = 0; i < len; i += Lanes(d)) {
 164       StoreInterleaved2(LoadU(d, &input[0][x0 + i]),
 165                         LoadU(d, &input[1][x0 + i]), d, &output[2 * i]);
 166     }
 167   } else if (num_channels == 3) {
 168     for (size_t i = 0; i < len; i += Lanes(d)) {
 169       StoreInterleaved3(LoadU(d, &input[0][x0 + i]),
 170                         LoadU(d, &input[1][x0 + i]),
 171                         LoadU(d, &input[2][x0 + i]), d, &output[3 * i]);
 172     }
 173   } else if (num_channels == 4) {
 174     for (size_t i = 0; i < len; i += Lanes(d)) {
 175       StoreInterleaved4(LoadU(d, &input[0][x0 + i]),
 176                         LoadU(d, &input[1][x0 + i]),
 177                         LoadU(d, &input[2][x0 + i]),
 178                         LoadU(d, &input[3][x0 + i]), d, &output[4 * i]);
 179     }
 180   }
 181 }
 182
 183 static constexpr float kFSWeightMR = 7.0f / 16.0f;
 184 static constexpr float kFSWeightBL = 3.0f / 16.0f;
 185 static constexpr float kFSWeightBM = 5.0f / 16.0f;
 186 static constexpr float kFSWeightBR = 1.0f / 16.0f;
 187
 188 float LimitError(float error) {
 189   float abserror = std::abs(error);
 190   if (abserror > 48.0f) {
 191     abserror = 32.0f;
 192   } else if (abserror > 16.0f) {
 193     abserror = 0.5f * abserror + 8.0f;
 194   }
 195   return error > 0.0f ? abserror : -abserror;
 196 }
 197
 198 void WriteToOutput(j_decompress_ptr cinfo, float* JXL_RESTRICT rows[],
 199                    size_t xoffset, size_t len, size_t num_channels,
 200                    uint8_t* JXL_RESTRICT output) {
 201   jpeg_decomp_master* m = cinfo->master;
 202   uint8_t* JXL_RESTRICT scratch_space = m->output_scratch_;
 203   if (cinfo->quantize_colors && m->quant_pass_ == 1) {
 204     float* error_row[kMaxComponents];
 205     float* next_error_row[kMaxComponents];
 206     if (cinfo->dither_mode == JDITHER_ORDERED) {
 207       for (size_t c = 0; c < num_channels; ++c) {
 208         DitherRow(cinfo, &rows[c][xoffset], c, cinfo->output_scanline,
 209                   cinfo->output_width);
 210       }
 211     } else if (cinfo->dither_mode == JDITHER_FS) {
 212       for (size_t c = 0; c < num_channels; ++c) {
 213         if (cinfo->output_scanline % 2 == 0) {
 214           error_row[c] = m->error_row_[c];
 215           next_error_row[c] = m->error_row_[c + kMaxComponents];
 216         } else {
 217           error_row[c] = m->error_row_[c + kMaxComponents];
 218           next_error_row[c] = m->error_row_[c];
 219         }
 220         memset(next_error_row[c], 0.0, cinfo->output_width * sizeof(float));
 221       }
 222     }
 223     const float mul = 255.0f;
 224     if (cinfo->dither_mode != JDITHER_FS) {
 225       StoreUnsignedRow(rows, xoffset, len, num_channels, mul, scratch_space);
 226     }
 227     for (size_t i = 0; i < len; ++i) {
 228       uint8_t* pixel = &scratch_space[num_channels * i];
 229       if (cinfo->dither_mode == JDITHER_FS) {
 230         for (size_t c = 0; c < num_channels; ++c) {
 231           float val = rows[c][i] * mul + LimitError(error_row[c][i]);
 232           pixel[c] = std::round(std::min(255.0f, std::max(0.0f, val)));
 233         }
 234       }
 235       int index = LookupColorIndex(cinfo, pixel);
 236       output[i] = index;
 237       if (cinfo->dither_mode == JDITHER_FS) {
 238         size_t prev_i = i > 0 ? i - 1 : 0;
 239         size_t next_i = i + 1 < len ? i + 1 : len - 1;
 240         for (size_t c = 0; c < num_channels; ++c) {
 241           float error = pixel[c] - cinfo->colormap[c][index];
 242           error_row[c][next_i] += kFSWeightMR * error;
 243           next_error_row[c][prev_i] += kFSWeightBL * error;
 244           next_error_row[c][i] += kFSWeightBM * error;
 245           next_error_row[c][next_i] += kFSWeightBR * error;
 246         }
 247       }
 248     }
 249   } else if (m->output_data_type_ == JPEGLI_TYPE_UINT8) {
 250     const float mul = 255.0;
 251     StoreUnsignedRow(rows, xoffset, len, num_channels, mul, scratch_space);
 252     memcpy(output, scratch_space, len * num_channels);
 253   } else if (m->output_data_type_ == JPEGLI_TYPE_UINT16) {
 254     const float mul = 65535.0;
 255     uint16_t* tmp = reinterpret_cast<uint16_t*>(scratch_space);
 256     StoreUnsignedRow(rows, xoffset, len, num_channels, mul, tmp);
 257     if (m->swap_endianness_) {
 258       const HWY_CAPPED(uint16_t, 8) du;
 259       size_t output_len = len * num_channels;
 260       for (size_t j = 0; j < output_len; j += Lanes(du)) {
 261         auto v = LoadU(du, tmp + j);
 262         auto vswap = Or(ShiftRightSame(v, 8), ShiftLeftSame(v, 8));
 263         StoreU(vswap, du, tmp + j);
 264       }
 265     }
 266     memcpy(output, tmp, len * num_channels * 2);
 267   } else if (m->output_data_type_ == JPEGLI_TYPE_FLOAT) {
 268     float* tmp = reinterpret_cast<float*>(scratch_space);
 269     StoreFloatRow(rows, xoffset, len, num_channels, tmp);
 270     if (m->swap_endianness_) {
 271       size_t output_len = len * num_channels;
 272       for (size_t j = 0; j < output_len; ++j) {
 273         tmp[j] = BSwapFloat(tmp[j]);
 274       }
 275     }
 276     memcpy(output, tmp, len * num_channels * 4);
 277   }
 278 }
 279
 280 // NOLINTNEXTLINE(google-readability-namespace-comments)
 281 }  // namespace HWY_NAMESPACE
 282 }  // namespace jpegli
 283 HWY_AFTER_NAMESPACE();
 284
 285 #if HWY_ONCE
 286
 287 namespace jpegli {
 288
 289 HWY_EXPORT(GatherBlockStats);
 290 HWY_EXPORT(WriteToOutput);
 291 HWY_EXPORT(DecenterRow);
 292
 293 void GatherBlockStats(const int16_t* JXL_RESTRICT coeffs,
 294                       const size_t coeffs_size, int32_t* JXL_RESTRICT nonzeros,
 295                       int32_t* JXL_RESTRICT sumabs) {
 296   return HWY_DYNAMIC_DISPATCH(GatherBlockStats)(coeffs, coeffs_size, nonzeros,
 297                                                 sumabs);
 298 }
 299
 300 void WriteToOutput(j_decompress_ptr cinfo, float* JXL_RESTRICT rows[],
 301                    size_t xoffset, size_t len, size_t num_channels,
 302                    uint8_t* JXL_RESTRICT output) {
 303   return HWY_DYNAMIC_DISPATCH(WriteToOutput)(cinfo, rows, xoffset, len,
 304                                              num_channels, output);
 305 }
 306
 307 void DecenterRow(float* row, size_t xsize) {
 308   return HWY_DYNAMIC_DISPATCH(DecenterRow)(row, xsize);
 309 }
 310
 311 bool ShouldApplyDequantBiases(j_decompress_ptr cinfo, int ci) {
 312   const auto& compinfo = cinfo->comp_info[ci];
 313   return (compinfo.h_samp_factor == cinfo->max_h_samp_factor &&
 314           compinfo.v_samp_factor == cinfo->max_v_samp_factor);
 315 }
 316
 317 // See the following article for the details:
 318 // J. R. Price and M. Rabbani, "Dequantization bias for JPEG decompression"
 319 // Proceedings International Conference on Information Technology: Coding and
 320 // Computing (Cat. No.PR00540), 2000, pp. 30-35, doi: 10.1109/ITCC.2000.844179.
 321 void ComputeOptimalLaplacianBiases(const int num_blocks, const int* nonzeros,
 322                                    const int* sumabs, float* biases) {
 323   for (size_t k = 1; k < DCTSIZE2; ++k) {
 324     if (nonzeros[k] == 0) {
 325       biases[k] = 0.5f;
 326       continue;
 327     }
 328     // Notation adapted from the article
 329     float N = num_blocks;
 330     float N1 = nonzeros[k];
 331     float N0 = num_blocks - N1;
 332     float S = sumabs[k];
 333     // Compute gamma from N0, N1, N, S (eq. 11), with A and B being just
 334     // temporary grouping of terms.
 335     float A = 4.0 * S + 2.0 * N;
 336     float B = 4.0 * S - 2.0 * N1;
 337     float gamma = (-1.0 * N0 + std::sqrt(N0 * N0 * 1.0 + A * B)) / A;
 338     float gamma2 = gamma * gamma;
 339     // The bias is computed from gamma with (eq. 5), where the quantization
 340     // multiplier Q can be factored out and thus the bias can be applied
 341     // directly on the quantized coefficient.
 342     biases[k] =
 343         0.5 * (((1.0 + gamma2) / (1.0 - gamma2)) + 1.0 / std::log(gamma));
 344   }
 345 }
 346
 347 constexpr std::array<int, SAVED_COEFS> Q_POS = {0, 1, 8,  16, 9,
 348                                                 2, 3, 10, 17, 24};
 349
 350 bool is_nonzero_quantizers(const JQUANT_TBL* qtable) {
 351   return std::all_of(Q_POS.begin(), Q_POS.end(),
 352                      [&](int pos) { return qtable->quantval[pos] != 0; });
 353 }
 354
 355 // Determine whether smoothing should be applied during decompression
 356 bool do_smoothing(j_decompress_ptr cinfo) {
 357   jpeg_decomp_master* m = cinfo->master;
 358   bool smoothing_useful = false;
 359
 360   if (!cinfo->progressive_mode || cinfo->coef_bits == nullptr) {
 361     return false;
 362   }
 363   auto coef_bits_latch = m->coef_bits_latch;
 364   auto prev_coef_bits_latch = m->prev_coef_bits_latch;
 365
 366   for (int ci = 0; ci < cinfo->num_components; ci++) {
 367     jpeg_component_info* compptr = &cinfo->comp_info[ci];
 368     JQUANT_TBL* qtable = compptr->quant_table;
 369     int* coef_bits = cinfo->coef_bits[ci];
 370     int* prev_coef_bits = cinfo->coef_bits[ci + cinfo->num_components];
 371
 372     // Return early if conditions for smoothing are not met
 373     if (qtable == nullptr || !is_nonzero_quantizers(qtable) ||
 374         coef_bits[0] < 0) {
 375       return false;
 376     }
 377
 378     coef_bits_latch[ci][0] = coef_bits[0];
 379
 380     for (int coefi = 1; coefi < SAVED_COEFS; coefi++) {
 381       prev_coef_bits_latch[ci][coefi] =
 382           cinfo->input_scan_number > 1 ? prev_coef_bits[coefi] : -1;
 383       if (coef_bits[coefi] != 0) {
 384         smoothing_useful = true;
 385       }
 386       coef_bits_latch[ci][coefi] = coef_bits[coefi];
 387     }
 388   }
 389
 390   return smoothing_useful;
 391 }
 392
 393 void PredictSmooth(j_decompress_ptr cinfo, JBLOCKARRAY blocks, int component,
 394                    size_t bx, int iy) {
 395   const size_t imcu_row = cinfo->output_iMCU_row;
 396   int16_t* scratch = cinfo->master->smoothing_scratch_;
 397   std::vector<int> Q_VAL(SAVED_COEFS);
 398   int* coef_bits;
 399
 400   std::array<std::array<int, 5>, 5> dc_values;
 401   auto& compinfo = cinfo->comp_info[component];
 402   const size_t by0 = imcu_row * compinfo.v_samp_factor;
 403   const size_t by = by0 + iy;
 404
 405   int prev_iy = by > 0 ? iy - 1 : 0;
 406   int prev_prev_iy = by > 1 ? iy - 2 : prev_iy;
 407   int next_iy = by + 1 < compinfo.height_in_blocks ? iy + 1 : iy;
 408   int next_next_iy = by + 2 < compinfo.height_in_blocks ? iy + 2 : next_iy;
 409
 410   const int16_t* cur_row = blocks[iy][bx];
 411   const int16_t* prev_row = blocks[prev_iy][bx];
 412   const int16_t* prev_prev_row = blocks[prev_prev_iy][bx];
 413   const int16_t* next_row = blocks[next_iy][bx];
 414   const int16_t* next_next_row = blocks[next_next_iy][bx];
 415
 416   int prev_block_ind = bx ? -DCTSIZE2 : 0;
 417   int prev_prev_block_ind = bx > 1 ? -2 * DCTSIZE2 : prev_block_ind;
 418   int next_block_ind = bx + 1 < compinfo.width_in_blocks ? DCTSIZE2 : 0;
 419   int next_next_block_ind =
 420       bx + 2 < compinfo.width_in_blocks ? DCTSIZE2 * 2 : next_block_ind;
 421
 422   std::array<const int16_t*, 5> row_ptrs = {prev_prev_row, prev_row, cur_row,
 423                                             next_row, next_next_row};
 424   std::array<int, 5> block_inds = {prev_prev_block_ind, prev_block_ind, 0,
 425                                    next_block_ind, next_next_block_ind};
 426
 427   memcpy(scratch, cur_row, DCTSIZE2 * sizeof(cur_row[0]));
 428
 429   for (int r = 0; r < 5; ++r) {
 430     for (int c = 0; c < 5; ++c) {
 431       dc_values[r][c] = row_ptrs[r][block_inds[c]];
 432     }
 433   }
 434   // Get the correct coef_bits: In case of an incomplete scan, we use the
 435   // prev coeficients.
 436   if (cinfo->output_iMCU_row + 1 > cinfo->input_iMCU_row) {
 437     coef_bits = cinfo->master->prev_coef_bits_latch[component];
 438   } else {
 439     coef_bits = cinfo->master->coef_bits_latch[component];
 440   }
 441
 442   bool change_dc = true;
 443   for (int i = 1; i < SAVED_COEFS; i++) {
 444     if (coef_bits[i] != -1) {
 445       change_dc = false;
 446       break;
 447     }
 448   }
 449
 450   JQUANT_TBL* quanttbl = cinfo->quant_tbl_ptrs[compinfo.quant_tbl_no];
 451   for (size_t i = 0; i < 6; ++i) {
 452     Q_VAL[i] = quanttbl->quantval[Q_POS[i]];
 453   }
 454   if (change_dc) {
 455     for (size_t i = 6; i < SAVED_COEFS; ++i) {
 456       Q_VAL[i] = quanttbl->quantval[Q_POS[i]];
 457     }
 458   }
 459   auto calculate_dct_value = [&](int coef_index) {
 460     int64_t num = 0;
 461     int pred;
 462     int Al;
 463     // we use the symmetry of the smoothing matrices by transposing the 5x5 dc
 464     // matrix in that case.
 465     bool swap_indices = coef_index == 2 || coef_index == 5 || coef_index == 8 ||
 466                         coef_index == 9;
 467     auto dc = [&](int i, int j) {
 468       return swap_indices ? dc_values[j][i] : dc_values[i][j];
 469     };
 470     Al = coef_bits[coef_index];
 471     switch (coef_index) {
 472       case 0:
 473         // set the DC
 474         num = (-2 * dc(0, 0) - 6 * dc(0, 1) - 8 * dc(0, 2) - 6 * dc(0, 3) -
 475                2 * dc(0, 4) - 6 * dc(1, 0) + 6 * dc(1, 1) + 42 * dc(1, 2) +
 476                6 * dc(1, 3) - 6 * dc(1, 4) - 8 * dc(2, 0) + 42 * dc(2, 1) +
 477                152 * dc(2, 2) + 42 * dc(2, 3) - 8 * dc(2, 4) - 6 * dc(3, 0) +
 478                6 * dc(3, 1) + 42 * dc(3, 2) + 6 * dc(3, 3) - 6 * dc(3, 4) -
 479                2 * dc(4, 0) - 6 * dc(4, 1) - 8 * dc(4, 2) - 6 * dc(4, 3) -
 480                2 * dc(4, 4));
 481         // special case: for the DC the dequantization is different
 482         Al = 0;
 483         break;
 484       case 1:
 485       case 2:
 486         // set Q01 or Q10
 487         num = (change_dc ? (-dc(0, 0) - dc(0, 1) + dc(0, 3) + dc(0, 4) -
 488                             3 * dc(1, 0) + 13 * dc(1, 1) - 13 * dc(1, 3) +
 489                             3 * dc(1, 4) - 3 * dc(2, 0) + 38 * dc(2, 1) -
 490                             38 * dc(2, 3) + 3 * dc(2, 4) - 3 * dc(3, 0) +
 491                             13 * dc(3, 1) - 13 * dc(3, 3) + 3 * dc(3, 4) -
 492                             dc(4, 0) - dc(4, 1) + dc(4, 3) + dc(4, 4))
 493                          : (-7 * dc(2, 0) + 50 * dc(2, 1) - 50 * dc(2, 3) +
 494                             7 * dc(2, 4)));
 495         break;
 496       case 3:
 497       case 5:
 498         // set Q02 or Q20
 499         num = (change_dc
 500                    ? dc(0, 2) + 2 * dc(1, 1) + 7 * dc(1, 2) + 2 * dc(1, 3) -
 501                          5 * dc(2, 1) - 14 * dc(2, 2) - 5 * dc(2, 3) +
 502                          2 * dc(3, 1) + 7 * dc(3, 2) + 2 * dc(3, 3) + dc(4, 2)
 503                    : (-dc(0, 2) + 13 * dc(1, 2) - 24 * dc(2, 2) +
 504                       13 * dc(3, 2) - dc(4, 2)));
 505         break;
 506       case 4:
 507         // set Q11
 508         num =
 509             (change_dc ? -dc(0, 0) + dc(0, 4) + 9 * dc(1, 1) - 9 * dc(1, 3) -
 510                              9 * dc(3, 1) + 9 * dc(3, 3) + dc(4, 0) - dc(4, 4)
 511                        : (dc(1, 4) + dc(3, 0) - 10 * dc(3, 1) + 10 * dc(3, 3) -
 512                           dc(0, 1) - dc(3, 4) + dc(4, 1) - dc(4, 3) + dc(0, 3) -
 513                           dc(1, 0) + 10 * dc(1, 1) - 10 * dc(1, 3)));
 514         break;
 515       case 6:
 516       case 9:
 517         // set Q03 or Q30
 518         num = (dc(1, 1) - dc(1, 3) + 2 * dc(2, 1) - 2 * dc(2, 3) + dc(3, 1) -
 519                dc(3, 3));
 520         break;
 521       case 7:
 522       case 8:
 523         // set Q12 and Q21
 524         num = (dc(1, 1) - 3 * dc(1, 2) + dc(1, 3) - dc(3, 1) + 3 * dc(3, 2) -
 525                dc(3, 3));
 526         break;
 527     }
 528     num = Q_VAL[0] * num;
 529     if (num >= 0) {
 530       pred = ((Q_VAL[coef_index] << 7) + num) / (Q_VAL[coef_index] << 8);
 531       if (Al > 0 && pred >= (1 << Al)) pred = (1 << Al) - 1;
 532     } else {
 533       pred = ((Q_VAL[coef_index] << 7) - num) / (Q_VAL[coef_index] << 8);
 534       if (Al > 0 && pred >= (1 << Al)) pred = (1 << Al) - 1;
 535       pred = -pred;
 536     }
 537     return static_cast<int16_t>(pred);
 538   };
 539
 540   int loop_end = change_dc ? SAVED_COEFS : 6;
 541   for (int i = 1; i < loop_end; ++i) {
 542     if (coef_bits[i] != 0 && scratch[Q_POS[i]] == 0) {
 543       scratch[Q_POS[i]] = calculate_dct_value(i);
 544     }
 545   }
 546   if (change_dc) {
 547     scratch[0] = calculate_dct_value(0);
 548   }
 549 }
 550
 551 void PrepareForOutput(j_decompress_ptr cinfo) {
 552   jpeg_decomp_master* m = cinfo->master;
 553   bool smoothing = do_smoothing(cinfo);
 554   m->apply_smoothing = smoothing && cinfo->do_block_smoothing;
 555   size_t coeffs_per_block = cinfo->num_components * DCTSIZE2;
 556   memset(m->nonzeros_, 0, coeffs_per_block * sizeof(m->nonzeros_[0]));
 557   memset(m->sumabs_, 0, coeffs_per_block * sizeof(m->sumabs_[0]));
 558   memset(m->num_processed_blocks_, 0, sizeof(m->num_processed_blocks_));
 559   memset(m->biases_, 0, coeffs_per_block * sizeof(m->biases_[0]));
 560   cinfo->output_iMCU_row = 0;
 561   cinfo->output_scanline = 0;
 562   const float kDequantScale = 1.0f / (8 * 255);
 563   for (int c = 0; c < cinfo->num_components; c++) {
 564     const auto& comp = cinfo->comp_info[c];
 565     JQUANT_TBL* table = comp.quant_table;
 566     if (table == nullptr) continue;
 567     for (size_t k = 0; k < DCTSIZE2; ++k) {
 568       m->dequant_[c * DCTSIZE2 + k] = table->quantval[k] * kDequantScale;
 569     }
 570   }
 571   ChooseInverseTransform(cinfo);
 572   ChooseColorTransform(cinfo);
 573 }
 574
 575 void DecodeCurrentiMCURow(j_decompress_ptr cinfo) {
 576   jpeg_decomp_master* m = cinfo->master;
 577   const size_t imcu_row = cinfo->output_iMCU_row;
 578   JBLOCKARRAY ba[kMaxComponents];
 579   for (int c = 0; c < cinfo->num_components; ++c) {
 580     const jpeg_component_info* comp = &cinfo->comp_info[c];
 581     int by0 = imcu_row * comp->v_samp_factor;
 582     int block_rows_left = comp->height_in_blocks - by0;
 583     int max_block_rows = std::min(comp->v_samp_factor, block_rows_left);
 584     int offset = m->streaming_mode_ ? 0 : by0;
 585     ba[c] = (*cinfo->mem->access_virt_barray)(
 586         reinterpret_cast<j_common_ptr>(cinfo), m->coef_arrays[c], offset,
 587         max_block_rows, false);
 588   }
 589   for (int c = 0; c < cinfo->num_components; ++c) {
 590     size_t k0 = c * DCTSIZE2;
 591     auto& compinfo = cinfo->comp_info[c];
 592     size_t block_row = imcu_row * compinfo.v_samp_factor;
 593     if (ShouldApplyDequantBiases(cinfo, c)) {
 594       // Update statistics for this iMCU row.
 595       for (int iy = 0; iy < compinfo.v_samp_factor; ++iy) {
 596         size_t by = block_row + iy;
 597         if (by >= compinfo.height_in_blocks) {
 598           continue;
 599         }
 600         int16_t* JXL_RESTRICT coeffs = &ba[c][iy][0][0];
 601         size_t num = compinfo.width_in_blocks * DCTSIZE2;
 602         GatherBlockStats(coeffs, num, &m->nonzeros_[k0], &m->sumabs_[k0]);
 603         m->num_processed_blocks_[c] += compinfo.width_in_blocks;
 604       }
 605       if (imcu_row % 4 == 3) {
 606         // Re-compute optimal biases every few iMCU-rows.
 607         ComputeOptimalLaplacianBiases(m->num_processed_blocks_[c],
 608                                       &m->nonzeros_[k0], &m->sumabs_[k0],
 609                                       &m->biases_[k0]);
 610       }
 611     }
 612     RowBuffer<float>* raw_out = &m->raw_output_[c];
 613     for (int iy = 0; iy < compinfo.v_samp_factor; ++iy) {
 614       size_t by = block_row + iy;
 615       if (by >= compinfo.height_in_blocks) {
 616         continue;
 617       }
 618       size_t dctsize = m->scaled_dct_size[c];
 619       int16_t* JXL_RESTRICT row_in = &ba[c][iy][0][0];
 620       float* JXL_RESTRICT row_out = raw_out->Row(by * dctsize);
 621       for (size_t bx = 0; bx < compinfo.width_in_blocks; ++bx) {
 622         if (m->apply_smoothing) {
 623           PredictSmooth(cinfo, ba[c], c, bx, iy);
 624           (*m->inverse_transform[c])(m->smoothing_scratch_, &m->dequant_[k0],
 625                                      &m->biases_[k0], m->idct_scratch_,
 626                                      &row_out[bx * dctsize], raw_out->stride(),
 627                                      dctsize);
 628         } else {
 629           (*m->inverse_transform[c])(&row_in[bx * DCTSIZE2], &m->dequant_[k0],
 630                                      &m->biases_[k0], m->idct_scratch_,
 631                                      &row_out[bx * dctsize], raw_out->stride(),
 632                                      dctsize);
 633         }
 634       }
 635       if (m->streaming_mode_) {
 636         memset(row_in, 0, compinfo.width_in_blocks * sizeof(JBLOCK));
 637       }
 638     }
 639   }
 640 }
 641
 642 void ProcessRawOutput(j_decompress_ptr cinfo, JSAMPIMAGE data) {
 643   jpegli::DecodeCurrentiMCURow(cinfo);
 644   jpeg_decomp_master* m = cinfo->master;
 645   for (int c = 0; c < cinfo->num_components; ++c) {
 646     const auto& compinfo = cinfo->comp_info[c];
 647     size_t comp_width = compinfo.width_in_blocks * DCTSIZE;
 648     size_t comp_height = compinfo.height_in_blocks * DCTSIZE;
 649     size_t comp_nrows = compinfo.v_samp_factor * DCTSIZE;
 650     size_t y0 = cinfo->output_iMCU_row * compinfo.v_samp_factor * DCTSIZE;
 651     size_t y1 = std::min(y0 + comp_nrows, comp_height);
 652     for (size_t y = y0; y < y1; ++y) {
 653       float* rows[1] = {m->raw_output_[c].Row(y)};
 654       uint8_t* output = data[c][y - y0];
 655       DecenterRow(rows[0], comp_width);
 656       WriteToOutput(cinfo, rows, 0, comp_width, 1, output);
 657     }
 658   }
 659   ++cinfo->output_iMCU_row;
 660   cinfo->output_scanline += cinfo->max_v_samp_factor * DCTSIZE;
 661   if (cinfo->output_scanline >= cinfo->output_height) {
 662     ++m->output_passes_done_;
 663   }
 664 }
 665
 666 void ProcessOutput(j_decompress_ptr cinfo, size_t* num_output_rows,
 667                    JSAMPARRAY scanlines, size_t max_output_rows) {
 668   jpeg_decomp_master* m = cinfo->master;
 669   const int vfactor = cinfo->max_v_samp_factor;
 670   const int hfactor = cinfo->max_h_samp_factor;
 671   const size_t context = m->need_context_rows_ ? 1 : 0;
 672   const size_t imcu_row = cinfo->output_iMCU_row;
 673   const size_t imcu_height = vfactor * m->min_scaled_dct_size;
 674   const size_t imcu_width = hfactor * m->min_scaled_dct_size;
 675   const size_t output_width = m->iMCU_cols_ * imcu_width;
 676   if (imcu_row == cinfo->total_iMCU_rows ||
 677       (imcu_row > context &&
 678        cinfo->output_scanline < (imcu_row - context) * imcu_height)) {
 679     // We are ready to output some scanlines.
 680     size_t ybegin = cinfo->output_scanline;
 681     size_t yend = (imcu_row == cinfo->total_iMCU_rows
 682                        ? cinfo->output_height
 683                        : (imcu_row - context) * imcu_height);
 684     yend = std::min<size_t>(yend, ybegin + max_output_rows - *num_output_rows);
 685     size_t yb = (ybegin / vfactor) * vfactor;
 686     size_t ye = DivCeil(yend, vfactor) * vfactor;
 687     for (size_t y = yb; y < ye; y += vfactor) {
 688       for (int c = 0; c < cinfo->num_components; ++c) {
 689         RowBuffer<float>* raw_out = &m->raw_output_[c];
 690         RowBuffer<float>* render_out = &m->render_output_[c];
 691         int line_groups = vfactor / m->v_factor[c];
 692         int downsampled_width = output_width / m->h_factor[c];
 693         size_t yc = y / m->v_factor[c];
 694         for (int dy = 0; dy < line_groups; ++dy) {
 695           size_t ymid = yc + dy;
 696           const float* JXL_RESTRICT row_mid = raw_out->Row(ymid);
 697           if (cinfo->do_fancy_upsampling && m->v_factor[c] == 2) {
 698             const float* JXL_RESTRICT row_top =
 699                 ymid == 0 ? row_mid : raw_out->Row(ymid - 1);
 700             const float* JXL_RESTRICT row_bot = ymid + 1 == m->raw_height_[c]
 701                                                     ? row_mid
 702                                                     : raw_out->Row(ymid + 1);
 703             Upsample2Vertical(row_top, row_mid, row_bot,
 704                               render_out->Row(2 * dy),
 705                               render_out->Row(2 * dy + 1), downsampled_width);
 706           } else {
 707             for (int yix = 0; yix < m->v_factor[c]; ++yix) {
 708               memcpy(render_out->Row(m->v_factor[c] * dy + yix), row_mid,
 709                      downsampled_width * sizeof(float));
 710             }
 711           }
 712           if (m->h_factor[c] > 1) {
 713             for (int yix = 0; yix < m->v_factor[c]; ++yix) {
 714               int row_ix = m->v_factor[c] * dy + yix;
 715               float* JXL_RESTRICT row = render_out->Row(row_ix);
 716               float* JXL_RESTRICT tmp = m->upsample_scratch_;
 717               if (cinfo->do_fancy_upsampling && m->h_factor[c] == 2) {
 718                 Upsample2Horizontal(row, tmp, output_width);
 719               } else {
 720                 // TODO(szabadka) SIMDify this.
 721                 for (size_t x = 0; x < output_width; ++x) {
 722                   tmp[x] = row[x / m->h_factor[c]];
 723                 }
 724                 memcpy(row, tmp, output_width * sizeof(tmp[0]));
 725               }
 726             }
 727           }
 728         }
 729       }
 730       for (int yix = 0; yix < vfactor; ++yix) {
 731         if (y + yix < ybegin || y + yix >= yend) continue;
 732         float* rows[kMaxComponents];
 733         int num_all_components =
 734             std::max(cinfo->out_color_components, cinfo->num_components);
 735         for (int c = 0; c < num_all_components; ++c) {
 736           rows[c] = m->render_output_[c].Row(yix);
 737         }
 738         (*m->color_transform)(rows, output_width);
 739         for (int c = 0; c < cinfo->out_color_components; ++c) {
 740           // Undo the centering of the sample values around zero.
 741           DecenterRow(rows[c], output_width);
 742         }
 743         if (scanlines) {
 744           uint8_t* output = scanlines[*num_output_rows];
 745           WriteToOutput(cinfo, rows, m->xoffset_, cinfo->output_width,
 746                         cinfo->out_color_components, output);
 747         }
 748         JXL_ASSERT(cinfo->output_scanline == y + yix);
 749         ++cinfo->output_scanline;
 750         ++(*num_output_rows);
 751         if (cinfo->output_scanline == cinfo->output_height) {
 752           ++m->output_passes_done_;
 753         }
 754       }
 755     }
 756   } else {
 757     DecodeCurrentiMCURow(cinfo);
 758     ++cinfo->output_iMCU_row;
 759   }
 760 }
 761
 762 }  // namespace jpegli
 763 #endif  // HWY_ONCE