Imported Upstream version 0.9.0
[platform/upstream/libjxl.git] / lib / jxl / enc_chroma_from_luma.cc
index 4f0798e..fa0d234 100644 (file)
 #include <hwy/foreach_target.h>
 #include <hwy/highway.h>
 
-#include "lib/jxl/aux_out.h"
 #include "lib/jxl/base/bits.h"
-#include "lib/jxl/base/padded_bytes.h"
-#include "lib/jxl/base/profiler.h"
+#include "lib/jxl/base/common.h"
 #include "lib/jxl/base/span.h"
 #include "lib/jxl/base/status.h"
-#include "lib/jxl/common.h"
+#include "lib/jxl/cms/opsin_params.h"
 #include "lib/jxl/dec_transforms-inl.h"
+#include "lib/jxl/enc_aux_out.h"
+#include "lib/jxl/enc_params.h"
 #include "lib/jxl/enc_transforms-inl.h"
 #include "lib/jxl/entropy_coder.h"
 #include "lib/jxl/image_ops.h"
 #include "lib/jxl/modular/encoding/encoding.h"
 #include "lib/jxl/quantizer.h"
+#include "lib/jxl/simd_util.h"
 HWY_BEFORE_NAMESPACE();
 namespace jxl {
 namespace HWY_NAMESPACE {
@@ -114,6 +115,7 @@ struct CFLFunction {
   float distance_mul;
 };
 
+// Chroma-from-luma search: values_m holds luma and values_s holds chroma.
 int32_t FindBestMultiplier(const float* values_m, const float* values_s,
                            size_t num, float base, float distance_mul,
                            bool fast) {
@@ -139,7 +141,7 @@ int32_t FindBestMultiplier(const float* values_m, const float* values_s,
     x = -GetLane(SumOfLanes(df, cb)) /
         (GetLane(SumOfLanes(df, ca)) + num * distance_mul * 0.5f);
   } else {
-    constexpr float eps = 1;
+    constexpr float eps = 100;
     constexpr float kClamp = 20.0f;
     CFLFunction fn(values_m, values_s, num, base, distance_mul);
     x = 0;
@@ -150,11 +152,26 @@ int32_t FindBestMultiplier(const float* values_m, const float* values_s,
       float dfpeps, dfmeps;
       float df = fn.Compute(x, eps, &dfpeps, &dfmeps);
       float ddf = (dfpeps - dfmeps) / (2 * eps);
-      float step = df / ddf;
+      float kExperimentalInsignificantStabilizer = 0.85;
+      float step = df / (ddf + kExperimentalInsignificantStabilizer);
       x -= std::min(kClamp, std::max(-kClamp, step));
       if (std::abs(step) < 3e-3) break;
     }
   }
+  // CFL seems to be tricky for larger transforms for HF components
+  // close to zero. This heuristic brings the solutions closer to zero
+  // and reduces red-green oscillations. A better approach would
+  // look into variance of the multiplier within separate (e.g. 8x8)
+  // areas and only apply this heuristic where there is a high variance.
+  // This would give about 1 % more compression density.
+  float towards_zero = 2.6;
+  if (x >= towards_zero) {
+    x -= towards_zero;
+  } else if (x <= -towards_zero) {
+    x += towards_zero;
+  } else {
+    x = 0;
+  }
   return std::max(-128.0f, std::min(127.0f, roundf(x)));
 }
 
@@ -185,17 +202,20 @@ void ComputeDC(const ImageF& dc_values, bool fast, int32_t* dc_x,
   *dc_x = FindBestMultiplier(dc_values_yx, dc_values_x, dc_values.xsize(), 0.0f,
                              kDistanceMultiplierDC, fast);
   *dc_b = FindBestMultiplier(dc_values_yb, dc_values_b, dc_values.xsize(),
-                             kYToBRatio, kDistanceMultiplierDC, fast);
+                             jxl::cms::kYToBRatio, kDistanceMultiplierDC, fast);
 }
 
 void ComputeTile(const Image3F& opsin, const DequantMatrices& dequant,
-                 const AcStrategyImage* ac_strategy, const Quantizer* quantizer,
+                 const AcStrategyImage* ac_strategy,
+                 const ImageI* raw_quant_field, const Quantizer* quantizer,
                  const Rect& r, bool fast, bool use_dct8, ImageSB* map_x,
                  ImageSB* map_b, ImageF* dc_values, float* mem) {
   static_assert(kEncTileDimInBlocks == kColorTileDimInBlocks,
                 "Invalid color tile dim");
   size_t xsize_blocks = opsin.xsize() / kBlockDim;
-  constexpr float kDistanceMultiplierAC = 1e-3f;
+  constexpr float kDistanceMultiplierAC = 1e-9f;
+  const size_t dct_scratch_size =
+      3 * (MaxVectorSize() / sizeof(float)) * AcStrategy::kMaxBlockDim;
 
   const size_t y0 = r.y0();
   const size_t x0 = r.x0();
@@ -222,8 +242,10 @@ void ComputeTile(const Image3F& opsin, const DequantMatrices& dequant,
   float* HWY_RESTRICT coeffs_yb = coeffs_x + kColorTileDim * kColorTileDim;
   float* HWY_RESTRICT coeffs_b = coeffs_yb + kColorTileDim * kColorTileDim;
   float* HWY_RESTRICT scratch_space = coeffs_b + kColorTileDim * kColorTileDim;
-  JXL_DASSERT(scratch_space + 2 * AcStrategy::kMaxCoeffArea ==
-              block_y + CfLHeuristics::kItemsPerThread);
+  float* scratch_space_end =
+      scratch_space + 2 * AcStrategy::kMaxCoeffArea + dct_scratch_size;
+  JXL_DASSERT(scratch_space_end == block_y + CfLHeuristics::ItemsPerThread());
+  (void)scratch_space_end;
 
   // Small (~256 bytes each)
   HWY_ALIGN_MAX float
@@ -259,9 +281,6 @@ void ComputeTile(const Image3F& opsin, const DequantMatrices& dequant,
           dequant.InvMatrix(acs.Strategy(), 0);
       const float* const JXL_RESTRICT qm_b =
           dequant.InvMatrix(acs.Strategy(), 2);
-      // Why does a constant seem to work better than
-      // raw_quant_field->Row(y)[x] ?
-      float q = use_dct8 ? 1 : quantizer->Scale() * 400.0f;
       float q_dc_x = use_dct8 ? 1 : 1.0f / quantizer->GetInvDcStep(0);
       float q_dc_b = use_dct8 ? 1 : 1.0f / quantizer->GetInvDcStep(2);
 
@@ -300,6 +319,14 @@ void ComputeTile(const Image3F& opsin, const DequantMatrices& dequant,
           block_b[cx * kBlockDim * iy + ix] = 0;
         }
       }
+      // Unclear why q has to be computed this way. (This works slightly
+      // better than the previous approach, which was also a hack.)
+      const float qq =
+          (raw_quant_field == nullptr) ? 1.0f : raw_quant_field->Row(y)[x];
+      // Experimentally, values of 128-130 seem best; it is not known why
+      // this multiplier is needed.
+      const float kStrangeMultiplier = 128;
+      float q = use_dct8 ? 1 : quantizer->Scale() * kStrangeMultiplier * qq;
       const auto qv = Set(df, q);
       for (size_t i = 0; i < cx * cy * 64; i += Lanes(df)) {
         const auto b_y = Load(df, block_y + i);
@@ -318,8 +345,9 @@ void ComputeTile(const Image3F& opsin, const DequantMatrices& dequant,
   JXL_CHECK(num_ac % Lanes(df) == 0);
   row_out_x[tx] = FindBestMultiplier(coeffs_yx, coeffs_x, num_ac, 0.0f,
                                      kDistanceMultiplierAC, fast);
-  row_out_b[tx] = FindBestMultiplier(coeffs_yb, coeffs_b, num_ac, kYToBRatio,
-                                     kDistanceMultiplierAC, fast);
+  row_out_b[tx] =
+      FindBestMultiplier(coeffs_yb, coeffs_b, num_ac, jxl::cms::kYToBRatio,
+                         kDistanceMultiplierAC, fast);
 }
 
 // NOLINTNEXTLINE(google-readability-namespace-comments)
@@ -344,12 +372,14 @@ void CfLHeuristics::Init(const Image3F& opsin) {
 void CfLHeuristics::ComputeTile(const Rect& r, const Image3F& opsin,
                                 const DequantMatrices& dequant,
                                 const AcStrategyImage* ac_strategy,
+                                const ImageI* raw_quant_field,
                                 const Quantizer* quantizer, bool fast,
                                 size_t thread, ColorCorrelationMap* cmap) {
   bool use_dct8 = ac_strategy == nullptr;
   HWY_DYNAMIC_DISPATCH(ComputeTile)
-  (opsin, dequant, ac_strategy, quantizer, r, fast, use_dct8, &cmap->ytox_map,
-   &cmap->ytob_map, &dc_values, mem.get() + thread * kItemsPerThread);
+  (opsin, dequant, ac_strategy, raw_quant_field, quantizer, r, fast, use_dct8,
+   &cmap->ytox_map, &cmap->ytob_map, &dc_values,
+   mem.get() + thread * ItemsPerThread());
 }
 
 void CfLHeuristics::ComputeDC(bool fast, ColorCorrelationMap* cmap) {
@@ -370,9 +400,10 @@ void ColorCorrelationMapEncodeDC(ColorCorrelationMap* map, BitWriter* writer,
 
   BitWriter::Allotment allotment(writer, 1 + 2 * kBitsPerByte + 12 + 32);
   if (ytox_dc == 0 && ytob_dc == 0 && color_factor == kDefaultColorFactor &&
-      base_correlation_x == 0.0f && base_correlation_b == kYToBRatio) {
+      base_correlation_x == 0.0f &&
+      base_correlation_b == jxl::cms::kYToBRatio) {
     writer->Write(1, 1);
-    ReclaimAndCharge(writer, &allotment, layer, aux_out);
+    allotment.ReclaimAndCharge(writer, layer, aux_out);
     return;
   }
   writer->Write(1, 0);
@@ -381,7 +412,7 @@ void ColorCorrelationMapEncodeDC(ColorCorrelationMap* map, BitWriter* writer,
   JXL_CHECK(F16Coder::Write(base_correlation_b, writer));
   writer->Write(kBitsPerByte, ytox_dc - std::numeric_limits<int8_t>::min());
   writer->Write(kBitsPerByte, ytob_dc - std::numeric_limits<int8_t>::min());
-  ReclaimAndCharge(writer, &allotment, layer, aux_out);
+  allotment.ReclaimAndCharge(writer, layer, aux_out);
 }
 
 }  // namespace jxl