#include <hwy/foreach_target.h>
#include <hwy/highway.h>
-#include "lib/jxl/aux_out.h"
#include "lib/jxl/base/bits.h"
-#include "lib/jxl/base/padded_bytes.h"
-#include "lib/jxl/base/profiler.h"
+#include "lib/jxl/base/common.h"
#include "lib/jxl/base/span.h"
#include "lib/jxl/base/status.h"
-#include "lib/jxl/common.h"
+#include "lib/jxl/cms/opsin_params.h"
#include "lib/jxl/dec_transforms-inl.h"
+#include "lib/jxl/enc_aux_out.h"
+#include "lib/jxl/enc_params.h"
#include "lib/jxl/enc_transforms-inl.h"
#include "lib/jxl/entropy_coder.h"
#include "lib/jxl/image_ops.h"
#include "lib/jxl/modular/encoding/encoding.h"
#include "lib/jxl/quantizer.h"
+#include "lib/jxl/simd_util.h"
HWY_BEFORE_NAMESPACE();
namespace jxl {
namespace HWY_NAMESPACE {
float distance_mul;
};
+// Chroma-from-luma search: values_m holds luma and values_s holds chroma.
int32_t FindBestMultiplier(const float* values_m, const float* values_s,
size_t num, float base, float distance_mul,
bool fast) {
x = -GetLane(SumOfLanes(df, cb)) /
(GetLane(SumOfLanes(df, ca)) + num * distance_mul * 0.5f);
} else {
- constexpr float eps = 1;
+ constexpr float eps = 100;
constexpr float kClamp = 20.0f;
CFLFunction fn(values_m, values_s, num, base, distance_mul);
x = 0;
float dfpeps, dfmeps;
float df = fn.Compute(x, eps, &dfpeps, &dfmeps);
float ddf = (dfpeps - dfmeps) / (2 * eps);
- float step = df / ddf;
+ float kExperimentalInsignificantStabilizer = 0.85;
+ float step = df / (ddf + kExperimentalInsignificantStabilizer);
x -= std::min(kClamp, std::max(-kClamp, step));
if (std::abs(step) < 3e-3) break;
}
}
+ // CFL seems to be tricky for larger transforms for HF components
+ // close to zero. This heuristic brings the solutions closer to zero
+ // and reduces red-green oscillations. A better approach would
+ // look at the variance of the multiplier within separate (e.g. 8x8)
+ // areas and apply this heuristic only where the variance is high.
+ // This would give about 1% more compression density.
+ float towards_zero = 2.6;
+ if (x >= towards_zero) {
+ x -= towards_zero;
+ } else if (x <= -towards_zero) {
+ x += towards_zero;
+ } else {
+ x = 0;
+ }
return std::max(-128.0f, std::min(127.0f, roundf(x)));
}
*dc_x = FindBestMultiplier(dc_values_yx, dc_values_x, dc_values.xsize(), 0.0f,
kDistanceMultiplierDC, fast);
*dc_b = FindBestMultiplier(dc_values_yb, dc_values_b, dc_values.xsize(),
- kYToBRatio, kDistanceMultiplierDC, fast);
+ jxl::cms::kYToBRatio, kDistanceMultiplierDC, fast);
}
void ComputeTile(const Image3F& opsin, const DequantMatrices& dequant,
- const AcStrategyImage* ac_strategy, const Quantizer* quantizer,
+ const AcStrategyImage* ac_strategy,
+ const ImageI* raw_quant_field, const Quantizer* quantizer,
const Rect& r, bool fast, bool use_dct8, ImageSB* map_x,
ImageSB* map_b, ImageF* dc_values, float* mem) {
static_assert(kEncTileDimInBlocks == kColorTileDimInBlocks,
"Invalid color tile dim");
size_t xsize_blocks = opsin.xsize() / kBlockDim;
- constexpr float kDistanceMultiplierAC = 1e-3f;
+ constexpr float kDistanceMultiplierAC = 1e-9f;
+ const size_t dct_scratch_size =
+ 3 * (MaxVectorSize() / sizeof(float)) * AcStrategy::kMaxBlockDim;
const size_t y0 = r.y0();
const size_t x0 = r.x0();
float* HWY_RESTRICT coeffs_yb = coeffs_x + kColorTileDim * kColorTileDim;
float* HWY_RESTRICT coeffs_b = coeffs_yb + kColorTileDim * kColorTileDim;
float* HWY_RESTRICT scratch_space = coeffs_b + kColorTileDim * kColorTileDim;
- JXL_DASSERT(scratch_space + 2 * AcStrategy::kMaxCoeffArea ==
- block_y + CfLHeuristics::kItemsPerThread);
+ float* scratch_space_end =
+ scratch_space + 2 * AcStrategy::kMaxCoeffArea + dct_scratch_size;
+ JXL_DASSERT(scratch_space_end == block_y + CfLHeuristics::ItemsPerThread());
+ (void)scratch_space_end;
// Small (~256 bytes each)
HWY_ALIGN_MAX float
dequant.InvMatrix(acs.Strategy(), 0);
const float* const JXL_RESTRICT qm_b =
dequant.InvMatrix(acs.Strategy(), 2);
- // Why does a constant seem to work better than
- // raw_quant_field->Row(y)[x] ?
- float q = use_dct8 ? 1 : quantizer->Scale() * 400.0f;
float q_dc_x = use_dct8 ? 1 : 1.0f / quantizer->GetInvDcStep(0);
float q_dc_b = use_dct8 ? 1 : 1.0f / quantizer->GetInvDcStep(2);
block_b[cx * kBlockDim * iy + ix] = 0;
}
}
+ // Unclear why this is like it is. (This works slightly better
+ // than the previous approach which was also a hack.)
+ const float qq =
+ (raw_quant_field == nullptr) ? 1.0f : raw_quant_field->Row(y)[x];
+ // Experimentally values 128-130 seem best -- I don't know why we
+ // need this multiplier.
+ const float kStrangeMultiplier = 128;
+ float q = use_dct8 ? 1 : quantizer->Scale() * kStrangeMultiplier * qq;
const auto qv = Set(df, q);
for (size_t i = 0; i < cx * cy * 64; i += Lanes(df)) {
const auto b_y = Load(df, block_y + i);
JXL_CHECK(num_ac % Lanes(df) == 0);
row_out_x[tx] = FindBestMultiplier(coeffs_yx, coeffs_x, num_ac, 0.0f,
kDistanceMultiplierAC, fast);
- row_out_b[tx] = FindBestMultiplier(coeffs_yb, coeffs_b, num_ac, kYToBRatio,
- kDistanceMultiplierAC, fast);
+ row_out_b[tx] =
+ FindBestMultiplier(coeffs_yb, coeffs_b, num_ac, jxl::cms::kYToBRatio,
+ kDistanceMultiplierAC, fast);
}
// NOLINTNEXTLINE(google-readability-namespace-comments)
// Public entry point for the per-tile chroma-from-luma computation over
// rect `r`. It does no work itself: it forwards every argument to the
// target-specific ComputeTile selected by HWY_DYNAMIC_DISPATCH.
//
// `ac_strategy` may be null; in that case the callee is told to use the
// DCT8-only path (`use_dct8`). `raw_quant_field` and `quantizer` are passed
// through unchanged. `thread` selects this thread's slice of the shared
// scratch buffer `mem`. `cmap->ytox_map` and `cmap->ytob_map` are handed to
// the callee by pointer (presumably as outputs -- confirm against the
// target-specific implementation).
void CfLHeuristics::ComputeTile(const Rect& r, const Image3F& opsin,
const DequantMatrices& dequant,
const AcStrategyImage* ac_strategy,
+ const ImageI* raw_quant_field,
const Quantizer* quantizer, bool fast,
size_t thread, ColorCorrelationMap* cmap) {
// No AC strategy image means the callee takes its use_dct8 path.
bool use_dct8 = ac_strategy == nullptr;
// The last argument is the scratch base offset by `thread` times the
// per-thread item count, so each thread gets its own region of `mem`.
HWY_DYNAMIC_DISPATCH(ComputeTile)
- (opsin, dequant, ac_strategy, quantizer, r, fast, use_dct8, &cmap->ytox_map,
- &cmap->ytob_map, &dc_values, mem.get() + thread * kItemsPerThread);
+ (opsin, dequant, ac_strategy, raw_quant_field, quantizer, r, fast, use_dct8,
+ &cmap->ytox_map, &cmap->ytob_map, &dc_values,
+ mem.get() + thread * ItemsPerThread());
}
void CfLHeuristics::ComputeDC(bool fast, ColorCorrelationMap* cmap) {
BitWriter::Allotment allotment(writer, 1 + 2 * kBitsPerByte + 12 + 32);
if (ytox_dc == 0 && ytob_dc == 0 && color_factor == kDefaultColorFactor &&
- base_correlation_x == 0.0f && base_correlation_b == kYToBRatio) {
+ base_correlation_x == 0.0f &&
+ base_correlation_b == jxl::cms::kYToBRatio) {
writer->Write(1, 1);
- ReclaimAndCharge(writer, &allotment, layer, aux_out);
+ allotment.ReclaimAndCharge(writer, layer, aux_out);
return;
}
writer->Write(1, 0);
JXL_CHECK(F16Coder::Write(base_correlation_b, writer));
writer->Write(kBitsPerByte, ytox_dc - std::numeric_limits<int8_t>::min());
writer->Write(kBitsPerByte, ytob_dc - std::numeric_limits<int8_t>::min());
- ReclaimAndCharge(writer, &allotment, layer, aux_out);
+ allotment.ReclaimAndCharge(writer, layer, aux_out);
}
} // namespace jxl